This is an archive of the discontinued LLVM Phabricator instance.

[x86] try harder to match bitwise 'or' into an LEA
ClosedPublic

Authored by spatel on Oct 21 2015, 1:43 PM.

Download Raw Diff

Details

Reviewers

qcolombet
chandlerc
silvas
kbsmith1
zansari

Commits

rG32538d681177: [x86] try harder to match bitwise 'or' into an LEA
rL252515: [x86] try harder to match bitwise 'or' into an LEA

Summary

The motivation for this patch starts with the epic fail example in PR18007:
https://llvm.org/bugs/show_bug.cgi?id=18007

...unfortunately, this patch makes no difference for that case, but it solves some simpler cases. We'll get there some day. :)

The current 'or' matching code was using computeKnownBits() via isBaseWithConstantOffset() -> MaskedValueIsZero(), but that's an unnecessarily limited use. We can do more by copying the logic in ValueTracking's haveNoCommonBitsSet(), so we can treat the 'or' as if it was an 'add'.

An example of the better LEA matching:

leal (%rdi,%rdi), %eax
andl $1, %esi
orl %esi, %eax

Becomes:

andl $1, %esi
leal (%rsi,%rdi,2), %eax

Diff Detail

Event Timeline

spatel updated this revision to Diff 38041. (Oct 21 2015, 1:43 PM)

spatel retitled this revision from to [x86] try harder to match bitwise 'or' into an LEA.

spatel updated this object.

spatel added reviewers: chandlerc, qcolombet, silvas.

spatel added a subscriber: llvm-commits.

Ping.

Ping * 2.

LGTM

This revision is now accepted and ready to land. (Nov 9 2015, 11:29 AM)

Hi Sanjay,

LGTM.

One comment though, I believe the problem with isBaseWithConstantOffset is not the use of MaskedValueIsZero, but the fact we expect a constant to feed that mask. I wonder if it could be of general use to refactor the logic so that it is available everywhere, like isOrSameAsAdd or something.

Cheers,
-Quentin

In D13956#285341, @qcolombet wrote:

One comment though, I believe the problem with isBaseWithConstantOffset is not the use of MaskedValueIsZero, but the fact we expect a constant to feed that mask. I wonder if it could be of general use to refactor the logic so that it is available everywhere, like isOrSameAsAdd or something.

Thanks, Kevin and Quentin. Yes, this is a good point. At the least, we should be able to share the logic with DAGCombiner. It is currently doing this:

// fold (a+b) -> (a|b) iff a and b share no bits.
if (VT.isInteger() && !VT.isVector()) {
  APInt LHSZero, LHSOne;
  APInt RHSZero, RHSOne;
  DAG.computeKnownBits(N0, LHSZero, LHSOne);

  if (LHSZero.getBoolValue()) {
    DAG.computeKnownBits(N1, RHSZero, RHSOne);

    // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
    // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
    if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero){
      if (!LegalOperations || TLI.isOperationLegal(ISD::OR, VT))
        return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1);
    }
  }
}

Closed by commit rL252515: [x86] try harder to match bitwise 'or' into an LEA (authored by spatel). · Explain Why · Nov 9 2015, 1:19 PM

This revision was automatically updated to reflect the committed changes.

For reference, I checked in the refactoring here:
http://reviews.llvm.org/rL252539

So now we have two "haveNoCommonBitsSet()" functions. We're going to kill the DAG, right? :)

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelDAGToDAG.cpp

29 lines

test/

CodeGen/

X86/

or-lea.ll

18 lines

x86-64-double-precision-shift-left.ll

15 lines

x86-64-double-precision-shift-right.ll

7 lines

Diff 38041

lib/Target/X86/X86ISelDAGToDAG.cpp

Show First 20 Lines • Show All 1,332 Lines • ▼ Show 20 Lines	case ISD::SUB: {
return false;		return false;
}		}

case ISD::ADD:		case ISD::ADD:
if (!matchAdd(N, AM, Depth))		if (!matchAdd(N, AM, Depth))
return false;		return false;
break;		break;

case ISD::OR:		case ISD::OR: {
// Handle "X | C" as "X + C" iff X is known to have C bits clear.		// We want to look through a transform in InstCombine and DAGCombiner that
if (CurDAG->isBaseWithConstantOffset(N)) {		// turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
X86ISelAddressMode Backup = AM;		APInt LHSZero, LHSOne;
ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1));		APInt RHSZero, RHSOne;
		CurDAG->computeKnownBits(N.getOperand(0), LHSZero, LHSOne);
// Start with the LHS as an addr mode.		CurDAG->computeKnownBits(N.getOperand(1), RHSZero, RHSOne);
if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
!foldOffsetIntoAddress(CN->getSExtValue(), AM))		// If we know that there are no common bits set by the operands of this
		// 'or', it is equivalent to an 'add'. For example:
		// (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
		// An 'lea' can then be used to match the shift (multiply) and add:
		// and $1, %esi
		// lea (%rsi, %rdi, 8), %rax
		if ((LHSZero | RHSZero).isAllOnesValue())
		if (!matchAdd(N, AM, Depth))
return false;		return false;
AM = Backup;
}
break;		break;
		}

case ISD::AND: {		case ISD::AND: {
// Perform some heroic transforms on an and of a constant-count shift		// Perform some heroic transforms on an and of a constant-count shift
// with a constant to enable use of the scaled offset field.		// with a constant to enable use of the scaled offset field.

// Scale must not be used already.		// Scale must not be used already.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;		if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

▲ Show 20 Lines • Show All 1,653 Lines • Show Last 20 Lines

test/CodeGen/X86/or-lea.ll

; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s		; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s

; InstCombine and DAGCombiner transform an 'add' into an 'or'		; InstCombine and DAGCombiner transform an 'add' into an 'or'
; if there are no common bits from the incoming operands.		; if there are no common bits from the incoming operands.
; LEA instruction selection should be able to see through that		; LEA instruction selection should be able to see through that
; transform and reduce add/shift/or instruction counts.		; transform and reduce add/shift/or instruction counts.

define i32 @or_shift1_and1(i32 %x, i32 %y) {		define i32 @or_shift1_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift1_and1:		; CHECK-LABEL: or_shift1_and1:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: addl %edi, %edi
; CHECK-NEXT: andl $1, %esi		; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi), %eax		; CHECK-NEXT: leal (%rsi,%rdi,2), %eax
; CHECK-NEXT: retq		; CHECK-NEXT: retq

%shl = shl i32 %x, 1		%shl = shl i32 %x, 1
%and = and i32 %y, 1		%and = and i32 %y, 1
%or = or i32 %and, %shl		%or = or i32 %and, %shl
ret i32 %or		ret i32 %or
}		}

define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) {		define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift1_and1_swapped:		; CHECK-LABEL: or_shift1_and1_swapped:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: leal (%rdi,%rdi), %eax
; CHECK-NEXT: andl $1, %esi		; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: orl %esi, %eax		; CHECK-NEXT: leal (%rsi,%rdi,2), %eax
; CHECK-NEXT: retq		; CHECK-NEXT: retq

%shl = shl i32 %x, 1		%shl = shl i32 %x, 1
%and = and i32 %y, 1		%and = and i32 %y, 1
%or = or i32 %shl, %and		%or = or i32 %shl, %and
ret i32 %or		ret i32 %or
}		}

define i32 @or_shift2_and1(i32 %x, i32 %y) {		define i32 @or_shift2_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift2_and1:		; CHECK-LABEL: or_shift2_and1:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: leal (,%rdi,4), %eax
; CHECK-NEXT: andl $1, %esi		; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: orl %esi, %eax		; CHECK-NEXT: leal (%rsi,%rdi,4), %eax
; CHECK-NEXT: retq		; CHECK-NEXT: retq

%shl = shl i32 %x, 2		%shl = shl i32 %x, 2
%and = and i32 %y, 1		%and = and i32 %y, 1
%or = or i32 %shl, %and		%or = or i32 %shl, %and
ret i32 %or		ret i32 %or
}		}

define i32 @or_shift3_and1(i32 %x, i32 %y) {		define i32 @or_shift3_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and1:		; CHECK-LABEL: or_shift3_and1:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: leal (,%rdi,8), %eax
; CHECK-NEXT: andl $1, %esi		; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: orl %esi, %eax		; CHECK-NEXT: leal (%rsi,%rdi,8), %eax
; CHECK-NEXT: retq		; CHECK-NEXT: retq

%shl = shl i32 %x, 3		%shl = shl i32 %x, 3
%and = and i32 %y, 1		%and = and i32 %y, 1
%or = or i32 %shl, %and		%or = or i32 %shl, %and
ret i32 %or		ret i32 %or
}		}

define i32 @or_shift3_and7(i32 %x, i32 %y) {		define i32 @or_shift3_and7(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and7:		; CHECK-LABEL: or_shift3_and7:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: leal (,%rdi,8), %eax
; CHECK-NEXT: andl $7, %esi		; CHECK-NEXT: andl $7, %esi
; CHECK-NEXT: orl %esi, %eax		; CHECK-NEXT: leal (%rsi,%rdi,8), %eax
; CHECK-NEXT: retq		; CHECK-NEXT: retq

%shl = shl i32 %x, 3		%shl = shl i32 %x, 3
%and = and i32 %y, 7		%and = and i32 %y, 7
%or = or i32 %shl, %and		%or = or i32 %shl, %and
ret i32 %or		ret i32 %or
}		}

Show All 29 Lines	; CHECK-NEXT: retq
ret i32 %or		ret i32 %or
}		}

; 64-bit operands should work too.		; 64-bit operands should work too.

define i64 @or_shift1_and1_64(i64 %x, i64 %y) {		define i64 @or_shift1_and1_64(i64 %x, i64 %y) {
; CHECK-LABEL: or_shift1_and1_64:		; CHECK-LABEL: or_shift1_and1_64:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: addq %rdi, %rdi
; CHECK-NEXT: andl $1, %esi		; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leaq (%rsi,%rdi), %rax		; CHECK-NEXT: leaq (%rsi,%rdi,2), %rax
; CHECK-NEXT: retq		; CHECK-NEXT: retq

%shl = shl i64 %x, 1		%shl = shl i64 %x, 1
%and = and i64 %y, 1		%and = and i64 %y, 1
%or = or i64 %and, %shl		%or = or i64 %and, %shl
ret i64 %or		ret i64 %or
}		}

test/CodeGen/X86/x86-64-double-precision-shift-left.ll

	; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s			; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
	; Verify that for the architectures that are known to have poor latency			; Verify that for the architectures that are known to have poor latency
	; double precision shift instructions we generate alternative sequence			; double precision shift instructions we generate alternative sequence
	; of instructions with lower latencies instead of shld instruction.			; of instructions with lower latencies instead of shld instruction.

	;uint64_t lshift1(uint64_t a, uint64_t b)			;uint64_t lshift1(uint64_t a, uint64_t b)
	;{			;{
	; return (a << 1) | (b >> 63);			; return (a << 1) | (b >> 63);
	;}			;}

	; CHECK: lshift1:			; CHECK-LABEL: lshift1:
	; CHECK: addq {{.*}},{{.*}}			; CHECK: shrq $63, %rsi
	; CHECK-NEXT: shrq $63, {{.*}}			; CHECK-NEXT: leaq (%rsi,%rdi,2), %rax
	; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}


	define i64 @lshift1(i64 %a, i64 %b) nounwind readnone uwtable {			define i64 @lshift1(i64 %a, i64 %b) nounwind readnone uwtable {
	entry:			entry:
	%shl = shl i64 %a, 1			%shl = shl i64 %a, 1
	%shr = lshr i64 %b, 63			%shr = lshr i64 %b, 63
	%or = or i64 %shr, %shl			%or = or i64 %shr, %shl
	ret i64 %or			ret i64 %or
	}			}

	;uint64_t lshift2(uint64_t a, uint64_t b)			;uint64_t lshift2(uint64_t a, uint64_t b)
	;{			;{
	; return (a << 2) | (b >> 62);			; return (a << 2) | (b >> 62);
	;}			;}

	; CHECK: lshift2:			; CHECK-LABEL: lshift2:
	; CHECK: shlq $2, {{.*}}			; CHECK: shrq $62, %rsi
	; CHECK-NEXT: shrq $62, {{.*}}			; CHECK-NEXT: leaq (%rsi,%rdi,4), %rax
	; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}

	define i64 @lshift2(i64 %a, i64 %b) nounwind readnone uwtable {			define i64 @lshift2(i64 %a, i64 %b) nounwind readnone uwtable {
	entry:			entry:
	%shl = shl i64 %a, 2			%shl = shl i64 %a, 2
	%shr = lshr i64 %b, 62			%shr = lshr i64 %b, 62
	%or = or i64 %shr, %shl			%or = or i64 %shr, %shl
	ret i64 %or			ret i64 %or
	}			}
	Show All 36 Lines

test/CodeGen/X86/x86-64-double-precision-shift-right.ll

Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	define i64 @rshift7(i64 %a, i64 %b) nounwind readnone uwtable {
ret i64 %3		ret i64 %3
}		}

;uint64_t rshift63(uint64_t a, uint64_t b)		;uint64_t rshift63(uint64_t a, uint64_t b)
;{		;{
; return (a >> 63) | (b << 1);		; return (a >> 63) | (b << 1);
;}		;}

; CHECK: rshift63:		; CHECK-LABEL: rshift63:
; CHECK: shrq $63, {{.*}}		; CHECK: shrq $63, %rdi
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}		; CHECK-NEXT: leaq (%rdi,%rsi,2), %rax
; CHECK-NEXT: orq {{.*}}, {{.*}}

define i64 @rshift63(i64 %a, i64 %b) nounwind readnone uwtable {		define i64 @rshift63(i64 %a, i64 %b) nounwind readnone uwtable {
%1 = lshr i64 %a, 63		%1 = lshr i64 %a, 63
%2 = shl i64 %b, 1		%2 = shl i64 %b, 1
%3 = or i64 %2, %1		%3 = or i64 %2, %1
ret i64 %3		ret i64 %3
}		}