This is an archive of the discontinued LLVM Phabricator instance.

[instcombine][x86] Converted pdep/pext with shifted mask to simple arithmetic
ClosedPublic

Authored by reames on Sep 17 2020, 3:20 PM.

Download Raw Diff

Details

Reviewers

anna
craig.topper

Commits

rG06f136f61e6d: [instcombine][x86] Converted pdep/pext with shifted mask to simple arithmetic

Summary

If the mask of a pdep or pext instruction is a shifted mask (i.e. one contiguous block of ones), we need at most one `and` and one shift to represent the operation without the intrinsic. On all platforms I know of, this is faster than the pdep/pext.

The cost modelling for multiple contiguous blocks might be worth exploring in a follow-up, but it's not relevant for my current use case. It would almost certainly be a win on AMD processors, though, where these instructions are really, really slow.

Diff Detail

Event Timeline

reames created this revision.Sep 17 2020, 3:20 PM

Herald added a project: Restricted Project. · View Herald TranscriptSep 17 2020, 3:20 PM

Herald added subscribers: dantrushin, bollu, hiraditya, mcrosier. · View Herald Transcript

reames requested review of this revision.Sep 17 2020, 3:20 PM

Harbormaster completed remote builds in B72095: Diff 292636.Sep 17 2020, 3:54 PM

LGTM

This revision is now accepted and ready to land.Sep 18 2020, 10:07 AM

thanks for this Philip!

This revision was landed with ongoing or failed builds.Sep 18 2020, 2:55 PM

Closed by commit rG06f136f61e6d: [instcombine][x86] Converted pdep/pext with shifted mask to simple arithmetic (authored by reames). · Explain Why

This revision was automatically updated to reflect the committed changes.

reames added a commit: rG06f136f61e6d: [instcombine][x86] Converted pdep/pext with shifted mask to simple arithmetic.

reames mentioned this in D88035: [AArch64] Teach analyzeBranch to remove branch equivelent to fallthrough.Sep 21 2020, 10:21 AM

reames mentioned this in rGe1a3271ebb87: [AArch64] Teach analyzeBranch to remove branch equivelent to fallthrough.Sep 22 2020, 2:38 PM

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86InstCombineIntrinsic.cpp

26 lines

test/

Transforms/

InstCombine/

X86/

x86-bmi-tbm.ll

42 lines

Diff 292636

llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp

Show First 20 Lines • Show All 992 Lines • ▼ Show 20 Lines	case Intrinsic::x86_bmi_pext_64:
if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {		if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
if (MaskC->isNullValue()) {		if (MaskC->isNullValue()) {
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));		return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
}		}
if (MaskC->isAllOnesValue()) {		if (MaskC->isAllOnesValue()) {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));		return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}		}

		if (MaskC->getValue().isShiftedMask()) {
		// any single contiguous sequence of 1s anywhere in the mask simply
		// describes a subset of the input bits shifted to the appropriate
		// position. Replace with the straight forward IR.
		unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
		Value *Input = II.getArgOperand(0);
		Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
		Value *Shifted = IC.Builder.CreateLShr(Masked,
		ConstantInt::get(II.getType(),
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - Value Shifted = IC.Builder.CreateLShr(Masked, - ConstantInt::get(II.getType(), - ShiftAmount)); + Value Shifted = IC.Builder.CreateLShr( + Masked, ConstantInt::get(II.getType(), ShiftAmount)); Lint: Pre-merge checks: clang-format: please reformat the code ``` - Value *Shifted = IC.Builder.CreateLShr…
		ShiftAmount));
		return IC.replaceInstUsesWith(II, Shifted);
		}


if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {		if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - Lint: Pre-merge checks: clang-format: please reformat the code ``` - ```
uint64_t Src = SrcC->getZExtValue();		uint64_t Src = SrcC->getZExtValue();
uint64_t Mask = MaskC->getZExtValue();		uint64_t Mask = MaskC->getZExtValue();
uint64_t Result = 0;		uint64_t Result = 0;
uint64_t BitToSet = 1;		uint64_t BitToSet = 1;

while (Mask) {		while (Mask) {
// Isolate lowest set bit.		// Isolate lowest set bit.
uint64_t BitToTest = Mask & -Mask;		uint64_t BitToTest = Mask & -Mask;
Show All 14 Lines	X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
case Intrinsic::x86_bmi_pdep_64:		case Intrinsic::x86_bmi_pdep_64:
if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {		if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
if (MaskC->isNullValue()) {		if (MaskC->isNullValue()) {
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));		return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
}		}
if (MaskC->isAllOnesValue()) {		if (MaskC->isAllOnesValue()) {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));		return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}		}
		if (MaskC->getValue().isShiftedMask()) {
		// any single contiguous sequence of 1s anywhere in the mask simply
		// describes a subset of the input bits shifted to the appropriate
		// position. Replace with the straight forward IR.
		unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
		Value *Input = II.getArgOperand(0);
		Value *Shifted = IC.Builder.CreateShl(Input,
		ConstantInt::get(II.getType(),
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - Value Shifted = IC.Builder.CreateShl(Input, - ConstantInt::get(II.getType(), - ShiftAmount)); + Value Shifted = IC.Builder.CreateShl( + Input, ConstantInt::get(II.getType(), ShiftAmount)); Lint: Pre-merge checks: clang-format: please reformat the code ``` - Value *Shifted = IC.Builder.CreateShl(Input…
		ShiftAmount));
		Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
		return IC.replaceInstUsesWith(II, Masked);
		}

if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {		if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
uint64_t Src = SrcC->getZExtValue();		uint64_t Src = SrcC->getZExtValue();
uint64_t Mask = MaskC->getZExtValue();		uint64_t Mask = MaskC->getZExtValue();
uint64_t Result = 0;		uint64_t Result = 0;
uint64_t BitToTest = 1;		uint64_t BitToTest = 1;

while (Mask) {		while (Mask) {
▲ Show 20 Lines • Show All 928 Lines • Show Last 20 Lines

llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll

	Show First 20 Lines • Show All 300 Lines • ▼ Show 20 Lines
	define i64 @test_x86_pext_64_allones_mask(i64 %x) nounwind readnone {			define i64 @test_x86_pext_64_allones_mask(i64 %x) nounwind readnone {
	; CHECK-LABEL: @test_x86_pext_64_allones_mask(			; CHECK-LABEL: @test_x86_pext_64_allones_mask(
	; CHECK-NEXT: ret i64 %x			; CHECK-NEXT: ret i64 %x
	;			;
	%1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 -1)			%1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 -1)
	ret i64 %1			ret i64 %1
	}			}

				define i32 @test_x86_pext_32_shifted_mask(i32 %x) nounwind readnone {
				; CHECK-LABEL: @test_x86_pext_32_shifted_mask(
				; CHECK-NEXT: %1 = lshr i32 %x, 1
				; CHECK-NEXT: %2 = and i32 %1, 3
				; CHECK-NEXT: ret i32 %2
				;
				%1 = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 6)
				ret i32 %1
				}

				define i64 @test_x86_pext_64_shifted_mask(i64 %x) nounwind readnone {
				; CHECK-LABEL: @test_x86_pext_64_shifted_mask(
				; CHECK-NEXT: %1 = lshr i64 %x, 1
				; CHECK-NEXT: %2 = and i64 %1, 3
				; CHECK-NEXT: ret i64 %2
				;
				%1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 6)
				ret i64 %1
				}


	define i32 @test_x86_pext_32_constant_fold() nounwind readnone {			define i32 @test_x86_pext_32_constant_fold() nounwind readnone {
	; CHECK-LABEL: @test_x86_pext_32_constant_fold(			; CHECK-LABEL: @test_x86_pext_32_constant_fold(
	; CHECK-NEXT: ret i32 30001			; CHECK-NEXT: ret i32 30001
	;			;
	%1 = tail call i32 @llvm.x86.bmi.pext.32(i32 1985229328, i32 4042322160)			%1 = tail call i32 @llvm.x86.bmi.pext.32(i32 1985229328, i32 4042322160)
	ret i32 %1			ret i32 %1
	}			}

	▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines
	define i64 @test_x86_pdep_64_allones_mask(i64 %x) nounwind readnone {			define i64 @test_x86_pdep_64_allones_mask(i64 %x) nounwind readnone {
	; CHECK-LABEL: @test_x86_pdep_64_allones_mask(			; CHECK-LABEL: @test_x86_pdep_64_allones_mask(
	; CHECK-NEXT: ret i64 %x			; CHECK-NEXT: ret i64 %x
	;			;
	%1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 -1)			%1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 -1)
	ret i64 %1			ret i64 %1
	}			}

				define i32 @test_x86_pdep_32_shifted_mask(i32 %x) nounwind readnone {
				; CHECK-LABEL: @test_x86_pdep_32_shifted_mask(
				; CHECK-NEXT: %1 = shl i32 %x, 2
				; CHECK-NEXT: %2 = and i32 %1, 12
				; CHECK-NEXT: ret i32 %2
				;
				%1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 12)
				ret i32 %1
				}

				define i64 @test_x86_pdep_64_shifted_mask(i64 %x) nounwind readnone {
				; CHECK-LABEL: @test_x86_pdep_64_shifted_mask(
				; CHECK-NEXT: %1 = shl i64 %x, 2
				; CHECK-NEXT: %2 = and i64 %1, 12
				; CHECK-NEXT: ret i64 %2
				;
				%1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 12)
				ret i64 %1
				}


	define i32 @test_x86_pdep_32_constant_fold() nounwind readnone {			define i32 @test_x86_pdep_32_constant_fold() nounwind readnone {
	; CHECK-LABEL: @test_x86_pdep_32_constant_fold(			; CHECK-LABEL: @test_x86_pdep_32_constant_fold(
	; CHECK-NEXT: ret i32 807407616			; CHECK-NEXT: ret i32 807407616
	;			;
	%1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 1985229328, i32 4042322160)			%1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 1985229328, i32 4042322160)
	ret i32 %1			ret i32 %1
	}			}

	Show All 23 Lines