Differential D67799
[InstCombine] Fold a shifty implementation of clamp-to-zero
Authored by huihuiz on Sep 19 2019, 11:28 PM

Details

Fold
  and(ashr(subNSW(X, V), ScalarSizeInBits - 1), V)
into
  V s> X ? V : 0
https://rise4fun.com/Alive/0Mi

Folding the shift into a select enables more optimization, e.g., vmax generation for the ARM target.
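(For readers following along, here is a minimal standalone C sketch of the equivalence being folded; the function names and test values are illustrative, not part of the patch:)

```c
#include <stdio.h>

/* Shift form: and(ashr(sub nsw(X, V), 31), V) for 32-bit values.
   If V s> X, the difference X - V is negative, the arithmetic shift by 31
   smears the sign bit into an all-ones mask, and the AND returns V;
   otherwise the mask is all zeros and the result is 0.  Widening to
   64-bit emulates the nsw (no-signed-wrap) guarantee.  (Arithmetic right
   shift of negative values is implementation-defined in C but matches
   LLVM's ashr on mainstream compilers.) */
static int clamp_shift(int x, int v) {
    return (int)((((long long)x - v) >> 63) & v);
}

/* Select form the fold produces: V s> X ? V : 0. */
static int clamp_select(int x, int v) {
    return v > x ? v : 0;
}

int main(void) {
    const int tests[][2] = {{0, 5}, {0, -5}, {3, 3}, {-7, 2}, {10, 4}};
    for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; ++i) {
        int x = tests[i][0], v = tests[i][1];
        printf("X=%3d V=%3d  shift=%3d  select=%3d\n",
               x, v, clamp_shift(x, v), clamp_select(x, v));
    }
    return 0;
}
```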
Event Timeline

Comment:
E.g., vmax generation for the ARM target.

test.c:

  static __inline int clamp0(int v) {
    return ((-(v) >> 31) & (v));
  }

  void foo(const unsigned char *src0, const unsigned char *src1,
           unsigned char *dst, int width) {
    int i;
    for (i = 0; i < width; ++i) {
      const int b = src0[0];
      const int b_sub = src1[0];
      dst[0] = clamp0(b - b_sub);
      src0++;
      src1++;
      dst++;
    }
  }

Run:

  clang -cc1 -triple armv8.1a-linux-gnu -target-abi apcs-gnu -target-feature +neon -vectorize-loops -vectorize-slp -O2 -S test-clamp0.c -o -

Before this optimization, "vneg + vshr + vand" is generated instead.
Comment:
We need to confirm that the backend produces better asm for at least a few in-tree targets before/after this transform. Please attach output for x86 and AArch64. We'll want to have examples for scalar and vector code, so you probably need to suppress the vectorizers.
Comment:
For the X86, AArch64, and ARM targets, the backend produces better asm with this transformation. Please refer to the examples below.
X86 target:

Test input; run command:
  clang -O2 -target x86_64 -march=skylake -S clamp0.ll -o -

  define i32 @clamp0(i32 %v) {
    %sub = sub nsw i32 0, %v
    %shr = ashr i32 %sub, 31
    %and = and i32 %shr, %v
    ret i32 %and
  }

Before:
  clamp0:                  # @clamp0
  # %bb.0:
    movl %edi, %eax
    negl %eax
    sarl $31, %eax
    andl %edi, %eax
    retq

After this optimization:
  clamp0:                  # @clamp0
  # %bb.0:
    movl %edi, %eax
    sarl $31, %eax
    andnl %edi, %eax, %eax
    retq

AArch64 target:

Before:
  clamp0:                  // @clamp0
  // %bb.0:
    neg w8, w0
    and w0, w0, w8, asr #31
    ret

After this optimization:
  clamp0:                  // @clamp0
  // %bb.0:
    bic w0, w0, w0, asr #31
    ret

ARM target:

Before:
  clamp0:
    .fnstart
  @ %bb.0:
    rsb r1, r0, #0
    and r0, r0, r1, asr #31
    bx lr

After this optimization:
  clamp0:
    .fnstart
  @ %bb.0:
    bic r0, r0, r0, asr #31
    bx lr
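(Side note on why a single andnl/bic suffices above: with X = 0 the select form `v s> 0 ? v : 0` is the same as masking v with the complement of its own smeared sign bit, `v & ~(v >> 31)`. A small illustrative C sketch, not from the patch:)

```c
#include <assert.h>
#include <stdint.h>

/* v & ~(v >> 31): the arithmetic shift smears the sign bit, so the
   complemented mask is all ones for v >= 0 (keep v) and all zeros for
   v < 0 (yield 0) -- one AND-NOT, exactly the andnl/bic above.
   (Arithmetic shift of a negative value is implementation-defined in C
   but matches LLVM's ashr on mainstream compilers.) */
static int32_t clamp0_andnot(int32_t v) {
    return v & ~(v >> 31);
}

int main(void) {
    assert(clamp0_andnot(7) == 7);
    assert(clamp0_andnot(0) == 0);
    assert(clamp0_andnot(-7) == 0);
    return 0;
}
```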
X86 target:

Test input:
  define <4 x i32> @clamp0-vec(<4 x i32> %v) {
    %sub = sub nsw <4 x i32> zeroinitializer, %v
    %shr = ashr <4 x i32> %sub, <i32 31, i32 31, i32 31, i32 31>
    %and = and <4 x i32> %shr, %v
    ret <4 x i32> %and
  }

Before:
  "clamp0-vec":            # @clamp0-vec
  # %bb.0:
    vpxor %xmm1, %xmm1, %xmm1
    vpsubd %xmm0, %xmm1, %xmm1
    vpsrad $31, %xmm1, %xmm1
    vpand %xmm0, %xmm1, %xmm0
    retq

After this optimization:
  "clamp0-vec":            # @clamp0-vec
  # %bb.0:
    vpxor %xmm1, %xmm1, %xmm1
    vpmaxsd %xmm1, %xmm0, %xmm0
    retq

AArch64 target:

Before:
  "clamp0-vec":            // @clamp0-vec
  // %bb.0:
    neg v1.4s, v0.4s
    sshr v1.4s, v1.4s, #31
    and v0.16b, v1.16b, v0.16b
    ret

After this optimization:
  "clamp0-vec":            // @clamp0-vec
  // %bb.0:
    movi v1.2d, #0000000000000000
    smax v0.4s, v0.4s, v1.4s
    ret

ARM target:

Before:
  "clamp0-vec":
    .fnstart
    vmov d17, r2, r3
    vmov d16, r0, r1
    vneg.s32 q9, q8
    vshr.s32 q9, q9, #31
    vand q8, q9, q8
    vmov r0, r1, d16
    vmov r2, r3, d17
    bx lr

After this optimization:
  "clamp0-vec":
    .fnstart
    vmov d17, r2, r3
    vmov d16, r0, r1
    vmov.i32 q9, #0x0
    vmax.s32 q8, q8, q9
    vmov r0, r1, d16
    vmov r2, r3, d17
    bx lr
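(For the vector case, the "after" sequences are just a per-lane max(v, 0). As a reference point, the same clamp written with SSE4.1 intrinsics in C — illustrative, not from the patch; compile with -msse4.1:)

```c
#include <assert.h>
#include <smmintrin.h>  /* SSE4.1: _mm_max_epi32 */
#include <stdint.h>

/* Per-lane clamp-to-zero as max(v, 0); this is what the vpxor + vpmaxsd
   pair in the "after" Skylake output computes. */
static __m128i clamp0_vec(__m128i v) {
    return _mm_max_epi32(v, _mm_setzero_si128());
}

int main(void) {
    __m128i r = clamp0_vec(_mm_setr_epi32(-3, 0, 7, -1));
    int32_t out[4];
    _mm_storeu_si128((__m128i *)out, r);
    assert(out[0] == 0 && out[1] == 0 && out[2] == 7 && out[3] == 0);
    return 0;
}
```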
Comment:
Ah, finally got it. There is a more general fold here:

  Name: sub_ashr_and_nsw
    %sub = sub nsw i8 %X, %v
    %ashr = ashr i8 %sub, 7
    %r = and i8 %ashr, %v
  =>
    %cmp = icmp sle i8 %v, %X
    %r = select i1 %cmp, i8 0, i8 %v

Comment:
Yes, all asm diffs look good to me. DAGCombiner knows how to convert a select with a '0' false operand into something better ('max' or 'and not' instructions). I'm not sure that will be true for the more general fold, though, so more testing will be needed for those patterns.
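(Side note: the more general i8 fold quoted above can be sanity-checked exhaustively in plain C; the harness below is illustrative and not part of the patch. The loop skips (X, v) pairs where the i8 subtraction would wrap, mirroring the nsw precondition of the Alive proof.)

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
    /* Exhaustively check: and(ashr(sub nsw(X, v), 7), v) == (v s> X ? v : 0)
       for every i8 pair where the subtraction does not wrap (the nsw
       precondition).  Shifting a negative value right is implementation-
       defined in C but matches LLVM's ashr on mainstream compilers. */
    for (int x = -128; x <= 127; ++x) {
        for (int v = -128; v <= 127; ++v) {
            int diff = x - v;              /* computed in int: never wraps */
            if (diff < -128 || diff > 127)
                continue;                  /* i8 'sub nsw' would wrap: skip */
            int8_t shift_form  = (int8_t)((((int8_t)diff) >> 7) & v);
            int8_t select_form = (int8_t)(v > x ? v : 0);
            assert(shift_form == select_form);
        }
    }
    return 0;
}
```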
Comment:
llvm-mca results for the more general folding pattern:

X86, Skylake (cmovgl latency 1)

Test input; run:
  clang clampNegToZero.ll -O2 -target x86_64 -march=skylake -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake

  define i32 @clamp0(i32 %v, i32 %x) {
    %sub = sub nsw i32 %x, %v
    %shr = ashr i32 %sub, 31
    %and = and i32 %shr, %v
    ret i32 %and
  }

Instruction Info legend, applying to all llvm-mca tables below:
[1] #uOps, [2] Latency, [3] RThroughput, (U) HasSideEffects.

Before:
  Iterations:        100
  Instructions:      500
  Total Cycles:      159
  Total uOps:        700
  Dispatch Width:    6
  uOps Per Cycle:    4.40
  IPC:               3.14
  Block RThroughput: 1.2

  [1] [2] [3]
   1   1  0.25     movl %esi, %eax
   1   1  0.25     subl %edi, %eax
   1   1  0.50     sarl $31, %eax
   1   1  0.25     andl %edi, %eax
   3   7  1.00  U  retq

After this transformation:
  Iterations:        100
  Instructions:      400
  Total Cycles:      110
  Total uOps:        600
  Dispatch Width:    6
  uOps Per Cycle:    5.45
  IPC:               3.64
  Block RThroughput: 1.0

  [1] [2] [3]
   1   0  0.17     xorl %eax, %eax
   1   1  0.25     cmpl %esi, %edi
   1   1  0.50     cmovgl %edi, %eax
   3   7  1.00  U  retq

X86, Cooper Lake (cmovgl latency also 1)

Same input; run:
  clang clampNegToZero.ll -O2 -target x86_64 -march=cooperlake -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=cooperlake

Before:
  Iterations:        100
  Instructions:      500
  Total Cycles:      159
  Total uOps:        700
  Dispatch Width:    6
  uOps Per Cycle:    4.40
  IPC:               3.14
  Block RThroughput: 1.2

  [1] [2] [3]
   1   1  0.25     movl %esi, %eax
   1   1  0.25     subl %edi, %eax
   1   1  0.50     sarl $31, %eax
   1   1  0.25     andl %edi, %eax
   3   7  1.00  U  retq

After this transformation:
  Iterations:        100
  Instructions:      400
  Total Cycles:      110
  Total uOps:        600
  Dispatch Width:    6
  uOps Per Cycle:    5.45
  IPC:               3.64
  Block RThroughput: 1.0

  [1] [2] [3]
   1   0  0.17     xorl %eax, %eax
   1   1  0.25     cmpl %esi, %edi
   1   1  0.50     cmovgl %edi, %eax
   3   7  1.00  U  retq

AMD:

Before:
  Iterations:        100
  Instructions:      500
  Total Cycles:      155
  Total uOps:        600
  Dispatch Width:    4
  uOps Per Cycle:    3.87
  IPC:               3.23
  Block RThroughput: 1.5

  [1] [2] [3]
   1   1  0.25     movl %esi, %eax
   1   1  0.25     subl %edi, %eax
   1   1  0.25     sarl $31, %eax
   1   1  0.25     andl %edi, %eax
   2   1  0.50  U  retq

After this transformation:
  Iterations:        100
  Instructions:      400
  Total Cycles:      203
  Total uOps:        500
  Dispatch Width:    4
  uOps Per Cycle:    2.46
  IPC:               1.97
  Block RThroughput: 1.3

  [1] [2] [3]
   1   1  0.25     xorl %eax, %eax
   1   1  0.25     cmpl %esi, %edi
   1   1  0.25     cmovgl %edi, %eax
   2   1  0.50  U  retq

AArch64, cortex-a57 (csel latency 1)

Before:
  Iterations:        100
  Instructions:      300
  Total Cycles:      303
  Total uOps:        300
  Dispatch Width:    3
  uOps Per Cycle:    0.99
  IPC:               0.99
  Block RThroughput: 1.0

  [1] [2] [3]
   1   1  0.50     sub w8, w1, w0
   1   2  1.00     and w0, w0, w8, asr #31
   1   1  1.00  U  ret

After this transformation:
  Iterations:        100
  Instructions:      300
  Total Cycles:      203
  Total uOps:        300
  Dispatch Width:    3
  uOps Per Cycle:    1.48
  IPC:               1.48
  Block RThroughput: 1.0

  [1] [2] [3]
   1   1  0.50     cmp w0, w1
   1   1  0.50     csel w0, w0, wzr, gt
   1   1  1.00  U  ret
Test input (vector):
  define <4 x i32> @clamp0-vec(<4 x i32> %v, <4 x i32> %x) {
    %sub = sub nsw <4 x i32> %x, %v
    %shr = ashr <4 x i32> %sub, <i32 31, i32 31, i32 31, i32 31>
    %and = and <4 x i32> %shr, %v
    ret <4 x i32> %and
  }

X86, Skylake

Before:
  Iterations:        100
  Instructions:      400
  Total Cycles:      303
  Total uOps:        600
  Dispatch Width:    6
  uOps Per Cycle:    1.98
  IPC:               1.32
  Block RThroughput: 1.0

  [1] [2] [3]
   1   1  0.33     vpsubd %xmm0, %xmm1, %xmm1
   1   1  0.50     vpsrad $31, %xmm1, %xmm1
   1   1  0.33     vpand %xmm0, %xmm1, %xmm0
   3   7  1.00  U  retq

After this transformation:
  Iterations:        100
  Instructions:      300
  Total Cycles:      203
  Total uOps:        500
  Dispatch Width:    6
  uOps Per Cycle:    2.46
  IPC:               1.48
  Block RThroughput: 1.0

  [1] [2] [3]
   1   1  0.50     vpcmpgtd %xmm1, %xmm0, %xmm1
   1   1  0.33     vpand %xmm0, %xmm1, %xmm0
   3   7  1.00  U  retq

AMD, znver2

Before:
  Iterations:        100
  Instructions:      400
  Total Cycles:      303
  Total uOps:        500
  Dispatch Width:    4
  uOps Per Cycle:    1.65
  IPC:               1.32
  Block RThroughput: 1.3

  [1] [2] [3]
   1   1  0.25     vpsubd %xmm0, %xmm1, %xmm1
   1   1  0.25     vpsrad $31, %xmm1, %xmm1
   1   1  0.25     vpand %xmm0, %xmm1, %xmm0
   2   1  0.50  U  retq

After this transformation:
  Iterations:        100
  Instructions:      300
  Total Cycles:      203
  Total uOps:        400
  Dispatch Width:    4
  uOps Per Cycle:    1.97
  IPC:               1.48
  Block RThroughput: 1.0

  [1] [2] [3]
   1   1  0.25     vpcmpgtd %xmm1, %xmm0, %xmm1
   1   1  0.25     vpand %xmm0, %xmm1, %xmm0
   2   1  0.50  U  retq

AArch64, cortex-a57

Before:
  Iterations:        100
  Instructions:      400
  Total Cycles:      903
  Total uOps:        400
  Dispatch Width:    3
  uOps Per Cycle:    0.44
  IPC:               0.44
  Block RThroughput: 1.5

  [1] [2] [3]
   1   3  0.50     sub v1.4s, v1.4s, v0.4s
   1   3  0.50     sshr v1.4s, v1.4s, #31
   1   3  0.50     and v0.16b, v1.16b, v0.16b
   1   1  1.00  U  ret

After this transformation:
  Iterations:        100
  Instructions:      300
  Total Cycles:      603
  Total uOps:        300
  Dispatch Width:    3
  uOps Per Cycle:    0.50
  IPC:               0.50
  Block RThroughput: 1.0

  [1] [2] [3]
   1   3  0.50     cmgt v1.4s, v0.4s, v1.4s
   1   3  0.50     and v0.16b, v0.16b, v1.16b
   1   1  1.00  U  ret

Comment:
Another note: on older-generation X86 targets, e.g., Haswell, cmov indeed has latency 2,
but it is still able to achieve a comparable uOps-per-cycle rate:

Before:
  Iterations:        100
  Instructions:      500
  Total Cycles:      210
  Total uOps:        700
  Dispatch Width:    4
  uOps Per Cycle:    3.33
  IPC:               2.38
  Block RThroughput: 1.8

  [1] [2] [3]
   1   1  0.25     movl %esi, %eax
   1   1  0.25     subl %edi, %eax
   1   1  0.50     sarl $31, %eax
   1   1  0.25     andl %edi, %eax
   3   7  1.00  U  retq

After:
  Iterations:        100
  Instructions:      400
  Total Cycles:      209
  Total uOps:        700
  Dispatch Width:    4
  uOps Per Cycle:    3.35
  IPC:               1.91
  Block RThroughput: 1.8

  [1] [2] [3]
   1   0  0.25     xorl %eax, %eax
   1   1  0.25     cmpl %esi, %edi
   2   2  0.50     cmovgl %edi, %eax
   3   7  1.00  U  retq

Comment:
Thanks, looks good to me.
Comment:
cc'ing @craig.topper @RKSimon @andreadb in case the use of x86 'cmov' has any pitfalls that we're not seeing so far.

Comment:
This is just FYI: llvm-mca results for AMD btver2 and bdver2.

AMD btver2

Before:
  Iterations:        100
  Instructions:      500
  Total Cycles:      256
  Total uOps:        500
  Dispatch Width:    2
  uOps Per Cycle:    1.95
  IPC:               1.95
  Block RThroughput: 2.5

  [1] [2] [3]
   1   1  0.50     movl %esi, %eax
   1   1  0.50     subl %edi, %eax
   1   1  0.50     sarl $31, %eax
   1   1  0.50     andl %edi, %eax
   1   4  1.00  U  retq

After:
  Iterations:        100
  Instructions:      400
  Total Cycles:      206
  Total uOps:        400
  Dispatch Width:    2
  uOps Per Cycle:    1.94
  IPC:               1.94
  Block RThroughput: 2.0

  [1] [2] [3]
   1   0  0.50     xorl %eax, %eax
   1   1  0.50     cmpl %esi, %edi
   1   1  0.50     cmovgl %edi, %eax
   1   4  1.00  U  retq

AMD bdver2

Before:
  Iterations:        100
  Instructions:      500
  Total Cycles:      455
  Total uOps:        500
  Dispatch Width:    4
  uOps Per Cycle:    1.10
  IPC:               1.10
  Block RThroughput: 4.0

  [1] [2] [3]
   1   1  1.00     movl %esi, %eax
   1   1  1.00     subl %edi, %eax
   1   1  1.00     sarl $31, %eax
   1   1  1.00     andl %edi, %eax
   1   5  1.50  U  retq

After:
  Iterations:        100
  Instructions:      400
  Total Cycles:      208
  Total uOps:        400
  Dispatch Width:    4
  uOps Per Cycle:    1.92
  IPC:               1.92
  Block RThroughput: 1.5

  [1] [2] [3]
   1   0  0.25     xorl %eax, %eax
   1   1  1.00     cmpl %esi, %edi
   1   1  0.50     cmovgl %edi, %eax
   1   5  1.50  U  retq
Comment:
maybe just
?