This is an archive of the discontinued LLVM Phabricator instance.

[InstCombine] Fold a shifty implementation of clamp-to-allones
ClosedPublic

Authored by huihuiz on Sep 19 2019, 11:37 PM.

Details

Summary

Fold

or(ashr(subNSW(X, V), ScalarSizeInBits - 1), V)

into

V s> X ? -1 : V

https://rise4fun.com/Alive/SBew

clamp255, a common operator in image processing, can be implemented
in a shifty way: "((255 - V) >> 31 | V) & 255". Folding the shift into a select
enables further optimization, e.g., vmin generation for the ARM target.
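
For concreteness, here is a minimal hand-written IR sketch of the fold (the function names are illustrative, not taken from the patch's test files):

define i32 @src(i32 %v, i32 %x) {
  %sub = sub nsw i32 %x, %v
  %shr = ashr i32 %sub, 31      ; ScalarSizeInBits - 1 for i32
  %or = or i32 %shr, %v
  ret i32 %or
}

; becomes, after the fold:

define i32 @tgt(i32 %v, i32 %x) {
  %cmp = icmp sgt i32 %v, %x
  %sel = select i1 %cmp, i32 -1, i32 %v
  ret i32 %sel
}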

Diff Detail

Repository
rL LLVM

Event Timeline

huihuiz created this revision.Sep 19 2019, 11:37 PM

E.g., vmin generation for the ARM target:

test.c

static inline int clamp255(int v) {
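  // If v > 255, (255 - v) is negative, so the arithmetic shift right by 31
  // yields all-ones; OR-ing with v gives -1, and the final mask clamps the
  // result to 255. If 0 <= v <= 255, the shift yields 0 and v passes through.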
  return (((255 - (v)) >> 31) | (v)) & 255;
}


void foo(const unsigned char* src1,
         const unsigned char* src2,
         unsigned char* dst,
         int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int r = src1[i];
    int b = src2[i];
    dst[0] = (unsigned char)clamp255(r + b);
    dst++;
  }
}

Run: clang -cc1 -triple armv8.1a-linux-gnu -target-abi apcs-gnu -target-feature +neon -vectorize-loops -vectorize-slp -O2 -S test.c -o -

You can see "vmin" being generated.

Before this optimization, "vsub + vshr + vorr" was generated instead.

lebedev.ri added inline comments.Sep 20 2019, 1:28 AM
llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
2685–2688 ↗(On Diff #220955)

same

2689 ↗(On Diff #220955)

As per the Alive proof, this isn't needed?

Similar to D67799

--- Scalar Test ---

X86 target:
Test input; Run: clang -O2 -target x86_64 -march=skylake -S clamp255.ll -o -

define i32 @clamp255(i32 %v) {
  %sub = sub nsw i32 255, %v
  %shr = ashr i32 %sub, 31
  %or = or i32 %shr, %v
  %and = and i32 %or, 255
  ret i32 %and
}
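
With X = 255, the or here matches the pattern above, so conceptually the body becomes "%v s> 255 ? -1 : %v" followed by the mask; roughly (a hand-written sketch, not verbatim InstCombine output):

  %cmp = icmp sgt i32 %v, 255
  %sel = select i1 %cmp, i32 -1, i32 %v
  %and = and i32 %sel, 255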

Before:

clamp255:                               # @clamp255
# %bb.0:
        movl    $255, %eax
        subl    %edi, %eax
        sarl    $31, %eax
        orl     %edi, %eax
        movzbl  %al, %eax
        retq

After this optimization:

clamp255:                               # @clamp255
# %bb.0:
        cmpl    $256, %edi              # imm = 0x100
        movl    $255, %eax
        cmovll  %edi, %eax
        movzbl  %al, %eax
        retq

AArch64 target:
Same input; Run: clang -O2 -target aarch64 -march=armv8a -S clamp255.ll -o -
Before:

clamp255:                               // @clamp255
// %bb.0:
        mov     w8, #255
        sub     w8, w8, w0
        orr     w8, w0, w8, asr #31
        and     w0, w8, #0xff
        ret

After this optimization:

clamp255:                               // @clamp255
// %bb.0:
        cmp     w0, #255                // =255
        mov     w8, #255
        csel    w8, w0, w8, lt
        and     w0, w8, #0xff
        ret

ARM target:
Same input; Run: clang -O2 -target arm -march=armv8.1a -S clamp255.ll -o -
Before:

clamp255:
        .fnstart
@ %bb.0:
        rsb     r1, r0, #255
        orr     r0, r0, r1, asr #31
        uxtb    r0, r0
        bx      lr

After this optimization:

clamp255:
        .fnstart
@ %bb.0:
        cmp     r0, #255
        movge   r0, #255
        uxtb    r0, r0
        bx      lr

--- Vector Test ---

X86 target:
Test input; Run: clang -O2 -target x86_64 -march=skylake -S clamp255-vec.ll -o -

define <4 x i32> @clamp255(<4 x i32> %v) {
  %sub = sub nsw <4 x i32> <i32 255, i32 255, i32 255, i32 255>, %v
  %shr = ashr <4 x i32> %sub, <i32 31, i32 31, i32 31, i32 31>
  %or = or <4 x i32> %shr, %v
  %and = and <4 x i32> %or, <i32 255, i32 255, i32 255, i32 255>
  ret <4 x i32> %and
}

Before:

clamp255:                               # @clamp255
# %bb.0:
        vpbroadcastd    .LCPI0_0(%rip), %xmm1 # xmm1 = [255,255,255,255]
        vpsubd  %xmm0, %xmm1, %xmm1
        vpsrad  $31, %xmm1, %xmm1
        vpor    %xmm0, %xmm1, %xmm0
        vpand   .LCPI0_1(%rip), %xmm0, %xmm0
        retq

After this optimization:

clamp255:                               # @clamp255
# %bb.0:
        vpbroadcastd    .LCPI0_0(%rip), %xmm1 # xmm1 = [255,255,255,255]
        vpminsd %xmm1, %xmm0, %xmm0
        vpand   .LCPI0_1(%rip), %xmm0, %xmm0
        retq

AArch64 target:
Same input; Run: clang -O2 -target aarch64 -march=armv8a -S clamp255-vec.ll -o -
Before:

clamp255:                               // @clamp255
// %bb.0:
        movi    v1.2d, #0x0000ff000000ff
        sub     v2.4s, v1.4s, v0.4s
        sshr    v2.4s, v2.4s, #31
        orr     v0.16b, v2.16b, v0.16b
        and     v0.16b, v0.16b, v1.16b
        ret

After this optimization:

clamp255:                               // @clamp255
// %bb.0:
        movi    v1.2d, #0x0000ff000000ff
        smin    v0.4s, v0.4s, v1.4s
        and     v0.16b, v0.16b, v1.16b
        ret

ARM target:
Same input; Run: clang -target arm-arm-none-eabi -mcpu=cortex-a57 -mfpu=neon-fp-armv8 -O2 -S clamp255-vec.ll -o -
Before:

clamp255:
        .fnstart
        vmov    d17, r2, r3
        vmov    d16, r0, r1
        vmov.i32        q9, #0xff
        vsub.i32        q10, q9, q8
        vshr.s32        q10, q10, #31
        vorr    q8, q10, q8
        vand    q8, q8, q9
        vmov    r0, r1, d16
        vmov    r2, r3, d17
        bx      lr

After this optimization:

clamp255:
        .fnstart
        vmov    d17, r2, r3
        vmov    d16, r0, r1
        vmov.i32        q9, #0xff
        vmin.s32        q8, q8, q9
        vand    q8, q8, q9
        vmov    r0, r1, d16
        vmov    r2, r3, d17
        bx      lr
huihuiz updated this revision to Diff 221146.Sep 20 2019, 6:08 PM
huihuiz marked 3 inline comments as done.

Resolved review feedback.

llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
2689 ↗(On Diff #220955)

The shift amount needs to be ScalarSizeInBits - 1; see https://rise4fun.com/Alive/nPP
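
For reference, a match along these lines might look roughly like the following in InstCombineAndOrXor.cpp (a sketch assuming the usual PatternMatch idioms, not necessarily the committed code):

// Fold or(ashr(subNSW(Y, X), BitWidth - 1), X) into X s> Y ? -1 : X.
Value *X, *Y;
Type *Ty = I.getType();
if (match(&I, m_c_Or(m_OneUse(m_AShr(
                         m_NSWSub(m_Value(Y), m_Value(X)),
                         m_SpecificInt(Ty->getScalarSizeInBits() - 1))),
                     m_Deferred(X)))) {
  Value *Cmp = Builder.CreateICmpSGT(X, Y);
  return SelectInst::Create(Cmp, Constant::getAllOnesValue(Ty), X);
}

Here m_SpecificInt is what enforces that the shift amount is exactly ScalarSizeInBits - 1.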

lebedev.ri added inline comments.Sep 21 2019, 3:06 AM
llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
2689 ↗(On Diff #220955)

Err, I actually meant to talk about C - it should just be %X.

clamp255: # @clamp255

cmpl    $256, %edi              # imm = 0x100
movl    $255, %eax
cmovll  %edi, %eax
movzbl  %al, %eax
retq

This is not a clear win vs. what we had before. Check theoretical perf with llvm-mca using at least 1 Intel CPU and 1 AMD CPU. Similarly for AArch64 - do we expect 'csel' to have same latency/throughput as simple ALU ops on a variety of micro-architectures?

huihuiz updated this revision to Diff 221259.Sep 23 2019, 12:37 AM
huihuiz retitled this revision from [InstCombine] Fold a shifty implementation of clamp (e.g., clamp255). to [InstCombine] Fold a shifty implementation of clamp positive to allOnesValue..
huihuiz edited the summary of this revision. (Show Details)

llvm-mca performance results for the general fold:

--- Scalar Test ---

Test input:

define i32 @clamp255(i32 %v, i32 %x) {
  %sub = sub nsw i32 %x, %v
  %shr = ashr i32 %sub, 31
  %or = or i32 %shr, %v
  ret i32 %or
}
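
After the fold, this is expected to become roughly (hand-written sketch):

define i32 @clamp255(i32 %v, i32 %x) {
  %cmp = icmp sgt i32 %v, %x
  %sel = select i1 %cmp, i32 -1, i32 %v
  ret i32 %sel
}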

X86 Skylake (cmov latency 1):
Run: clang clampPositiveToMinusOne.ll -O2 -target x86_64 -march=skylake -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake
Before:

Iterations:        100
Instructions:      500
Total Cycles:      159
Total uOps:        700

Dispatch Width:    6
uOps Per Cycle:    4.40
IPC:               3.14
Block RThroughput: 1.2


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.25                        movl  %esi, %eax
 1      1     0.25                        subl  %edi, %eax
 1      1     0.50                        sarl  $31, %eax
 1      1     0.25                        orl   %edi, %eax
 3      7     1.00                  U     retq

After:

Iterations:        100
Instructions:      400
Total Cycles:      135
Total uOps:        600

Dispatch Width:    6
uOps Per Cycle:    4.44
IPC:               2.96
Block RThroughput: 1.0


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.25                        cmpl  %esi, %edi
 1      1     0.25                        movl  $-1, %eax
 1      1     0.50                        cmovlel       %edi, %eax
 3      7     1.00                  U     retq

AMD znver2:
Run: clang clampPositiveToMinusOne.ll -O2 -target x86_64 -march=znver2 -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2
Before:

Iterations:        100
Instructions:      500
Total Cycles:      155
Total uOps:        600

Dispatch Width:    4
uOps Per Cycle:    3.87
IPC:               3.23
Block RThroughput: 1.5


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.25                        movl  %esi, %eax
 1      1     0.25                        subl  %edi, %eax
 1      1     0.25                        sarl  $31, %eax
 1      1     0.25                        orl   %edi, %eax
 2      1     0.50                  U     retq

After:

Iterations:        100
Instructions:      400
Total Cycles:      137
Total uOps:        500

Dispatch Width:    4
uOps Per Cycle:    3.65
IPC:               2.92
Block RThroughput: 1.3


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.25                        cmpl  %esi, %edi
 1      1     0.25                        movl  $-1, %eax
 1      1     0.25                        cmovlel       %edi, %eax
 2      1     0.50                  U     retq

AArch64 cortex-a57:
Run: clang clampPositiveToMinusOne.ll -O2 -target aarch64 -mcpu=cortex-a57 -S -o - | llvm-mca -mtriple=aarch64 -mcpu=cortex-a57
Before:

Iterations:        100
Instructions:      300
Total Cycles:      303
Total uOps:        300

Dispatch Width:    3
uOps Per Cycle:    0.99
IPC:               0.99
Block RThroughput: 1.0


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.50                        sub   w8, w1, w0
 1      2     1.00                        orr   w0, w0, w8, asr #31
 1      1     1.00                  U     ret

After:

Iterations:        100
Instructions:      300
Total Cycles:      203
Total uOps:        300

Dispatch Width:    3
uOps Per Cycle:    1.48
IPC:               1.48
Block RThroughput: 1.0


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.50                        cmp   w0, w1
 1      1     0.50                        csinv w0, w0, wzr, le
 1      1     1.00                  U     ret

--- Vector Test ---

Test input:

define <4 x i32> @clamp255(<4 x i32> %v, <4 x i32> %x) {
  %sub = sub nsw <4 x i32> %x, %v
  %shr = ashr <4 x i32> %sub, <i32 31, i32 31, i32 31, i32 31>
  %or = or <4 x i32> %shr, %v
  ret <4 x i32> %or
}
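
After the fold, the vector version is expected to become roughly (hand-written sketch):

define <4 x i32> @clamp255(<4 x i32> %v, <4 x i32> %x) {
  %cmp = icmp sgt <4 x i32> %v, %x
  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %v
  ret <4 x i32> %sel
}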

X86 Skylake:
Run: clang clampPositiveToMinusOne-vec.ll -O2 -target x86_64 -march=skylake -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake

Before:

Iterations:        100
Instructions:      400
Total Cycles:      303
Total uOps:        600

Dispatch Width:    6
uOps Per Cycle:    1.98
IPC:               1.32
Block RThroughput: 1.0


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.33                        vpsubd        %xmm0, %xmm1, %xmm1
 1      1     0.50                        vpsrad        $31, %xmm1, %xmm1
 1      1     0.33                        vpor  %xmm0, %xmm1, %xmm0
 3      7     1.00                  U     retq

After:

Iterations:        100
Instructions:      300
Total Cycles:      203
Total uOps:        500

Dispatch Width:    6
uOps Per Cycle:    2.46
IPC:               1.48
Block RThroughput: 1.0


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.50                        vpcmpgtd      %xmm1, %xmm0, %xmm1
 1      1     0.33                        vpor  %xmm0, %xmm1, %xmm0
 3      7     1.00                  U     retq

AMD znver2:
Run: clang clampPositiveToMinusOne-vec.ll -O2 -target x86_64 -march=znver2 -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2
Before:

Iterations:        100
Instructions:      400
Total Cycles:      303
Total uOps:        500

Dispatch Width:    4
uOps Per Cycle:    1.65
IPC:               1.32
Block RThroughput: 1.3


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.25                        vpsubd        %xmm0, %xmm1, %xmm1
 1      1     0.25                        vpsrad        $31, %xmm1, %xmm1
 1      1     0.25                        vpor  %xmm0, %xmm1, %xmm0
 2      1     0.50                  U     retq

After:

Iterations:        100
Instructions:      300
Total Cycles:      203
Total uOps:        400

Dispatch Width:    4
uOps Per Cycle:    1.97
IPC:               1.48
Block RThroughput: 1.0


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.25                        vpcmpgtd      %xmm1, %xmm0, %xmm1
 1      1     0.25                        vpor  %xmm0, %xmm1, %xmm0
 2      1     0.50                  U     retq

AArch64 cortex-a57:
Run: clang clampPositiveToMinusOne-vec.ll -O2 -target aarch64 -mcpu=cortex-a57 -S -o - | llvm-mca -mtriple=aarch64 -mcpu=cortex-a57
Before:

Iterations:        100
Instructions:      400
Total Cycles:      903
Total uOps:        400

Dispatch Width:    3
uOps Per Cycle:    0.44
IPC:               0.44
Block RThroughput: 1.5


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      3     0.50                        sub   v1.4s, v1.4s, v0.4s
 1      3     0.50                        sshr  v1.4s, v1.4s, #31
 1      3     0.50                        orr   v0.16b, v1.16b, v0.16b
 1      1     1.00                  U     ret

After:

Iterations:        100
Instructions:      300
Total Cycles:      603
Total uOps:        300

Dispatch Width:    3
uOps Per Cycle:    0.50
IPC:               0.50
Block RThroughput: 1.0


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      3     0.50                        cmgt  v1.4s, v0.4s, v1.4s
 1      3     0.50                        orr   v0.16b, v0.16b, v1.16b
 1      1     1.00                  U     ret
lebedev.ri accepted this revision.Sep 23 2019, 6:35 AM
lebedev.ri retitled this revision from [InstCombine] Fold a shifty implementation of clamp positive to allOnesValue. to [InstCombine] Fold a shifty implementation of clamp-to-allones.
lebedev.ri edited the summary of this revision. (Show Details)

Thanks, LG.

I do think this is the correct canonicalization; all those mca numbers for non-constants seem good to me. @spatel?

llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
2678 ↗(On Diff #221259)

Likewise, can we s/X/Y/ s/V/X/ them?

This revision is now accepted and ready to land.Sep 23 2019, 6:36 AM
This revision was automatically updated to reflect the committed changes.