This is an archive of the discontinued LLVM Phabricator instance.

[CodeGen][ExpandMemcmp] Add an option for allowing overlapping loads.
ClosedPublic

Authored by courbet on Dec 4 2018, 4:50 AM.

Details

Summary

This allows expanding {7,11,13,14,15,21,22,23,25,26,27,28,29,30,31}-byte memcmp
in just two loads on X86. These were previously calling memcmp.
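
To make the expansion concrete, here is a rough C++ model of the 15-byte, equality-only case (a sketch of the idea, not the IR the pass emits; equal15 is a made-up name): two 8-byte loads per buffer, with the second pair overlapping the first by one byte.

```
#include <cstdint>
#include <cstring>

// Sketch: 15-byte equality-only memcmp done with overlapping loads.
// Bytes [0, 8) and [7, 15) together cover all 15 bytes; byte 7 is read twice.
bool equal15(const char *a, const char *b) {
  uint64_t a0, a1, b0, b1;
  std::memcpy(&a0, a, 8);      // first 8 bytes
  std::memcpy(&a1, a + 7, 8);  // last 8 bytes, overlapping byte 7
  std::memcpy(&b0, b, 8);
  std::memcpy(&b1, b + 7, 8);
  return ((a0 ^ b0) | (a1 ^ b1)) == 0;
}
```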

Diff Detail

Event Timeline

courbet created this revision.Dec 4 2018, 4:50 AM

I just looked over the codegen changes so far, but I want to add some more knowledgeable x86 hackers to have a look too. There are 2 concerns:

  1. Are there any known uarch problems with overlapping loads?
  2. Are there any known uarch problems with unaligned accesses (either scalar or SSE)?

If there's any perf data (either nano-benchmarks or full apps) to support the changes, that would be nice to see. This reminds me of PR33329:
https://bugs.llvm.org/show_bug.cgi?id=33329 (can we close that now?)

I just looked over the codegen changes so far, but I want to add some more knowledgeable x86 hackers to have a look too. There are 2 concerns:

  1. Are there any known uarch problems with overlapping loads?
  2. Are there any known uarch problems with unaligned accesses (either scalar or SSE)?

FYI we're already using overlapping loads for memcpy lowering: https://godbolt.org/z/9iSE3g

Here's a basic benchmark for memcmp(a, b, N) where N is a compile-time constant, and a and b differ first at character M:

The change makes the impacted values 2.5 - 3x as fast.
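
The harness is roughly the following (a reconstruction from the description above, not the exact benchmark source; names and registration macros are assumptions):

```
#include <cstring>
#include <benchmark/benchmark.h>

// BM_Cmp<N, M>: memcmp of two N-byte buffers that first differ at byte M
// (M == -1 means the buffers are identical).
template <int N, int M>
void BM_Cmp(benchmark::State &state) {
  char a[N + 1] = {};  // +1 so N == 0 still compiles
  char b[N + 1] = {};
  if (M >= 0 && M < N)
    b[M] = 1;
  const char *pa = a, *pb = b;
  for (auto _ : state) {
    benchmark::DoNotOptimize(pa);
    benchmark::DoNotOptimize(pb);
    benchmark::DoNotOptimize(std::memcmp(pa, pb, N));
  }
}
BENCHMARK_TEMPLATE2(BM_Cmp, 7, -1);
BENCHMARK_TEMPLATE2(BM_Cmp, 15, 7);
BENCHMARK_MAIN();
```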

"BMCmp<N, M>"basethis changespeedup
"BM_Cmp<0, -1>"0.2930.2921.003424658
"BM_Cmp<1, -1>"0.640.641
"BM_Cmp<2, -1>"0.6370.6361.001572327
"BM_Cmp<3, -1>"1.081.081
"BM_Cmp<4, -1>"0.6370.6371
"BM_Cmp<5, -1>"1.081.081
"BM_Cmp<6, -1>"1.081.071.009345794
"BM_Cmp<7, -1>"2.821.032.737864078
"BM_Cmp<8, -1>"0.6370.6371
"BM_Cmp<9, -1>"1.081.081
"BM_Cmp<10, -1>"1.081.071.009345794
"BM_Cmp<11, -1>"3.081.032.990291262
"BM_Cmp<12, -1>"1.031.031
"BM_Cmp<13, -1>"3.081.032.990291262
"BM_Cmp<14, -1>"3.091.033
"BM_Cmp<15, -1>"3.081.032.990291262
"BM_Cmp<16, -1>"0.8430.8440.9988151659
"BM_Cmp<17, -1>"1.331.331
"BM_Cmp<18, -1>"1.331.331
"BM_Cmp<19, -1>"3.361.262.666666667
"BM_Cmp<20, -1>"1.211.211
"BM_Cmp<21, -1>"3.071.182.601694915
"BM_Cmp<22, -1>"3.071.262.436507937
"BM_Cmp<23, -1>"3.071.262.436507937
"BM_Cmp<24, -1>"1.211.211
"BM_Cmp<25, -1>"3.351.262.658730159
"BM_Cmp<26, -1>"3.631.262.880952381
"BM_Cmp<27, -1>"3.351.262.658730159
"BM_Cmp<28, -1>"3.071.262.436507937
"BM_Cmp<29, -1>"3.351.262.658730159
"BM_Cmp<30, -1>"3.351.262.658730159
"BM_Cmp<31, -1>"3.351.262.658730159
"BM_Cmp<32, -1>"1.261.251.008
"BM_Cmp<0, 0>"0.2860.2851.003508772
"BM_Cmp<1, 0>"0.6350.6351
"BM_Cmp<2, 0>"0.6340.6331.001579779
"BM_Cmp<3, 0>"1.071.071
"BM_Cmp<4, 0>"0.6410.6341.011041009
"BM_Cmp<5, 0>"1.071.071
"BM_Cmp<6, 0>"1.071.071
"BM_Cmp<7, 0>"2.791.032.708737864
"BM_Cmp<8, 0>"0.6330.6321.001582278
"BM_Cmp<9, 0>"1.071.080.9907407407
"BM_Cmp<10, 0>"1.081.071.009345794
"BM_Cmp<11, 0>"3.081.032.990291262
"BM_Cmp<12, 0>"1.041.031.009708738
"BM_Cmp<13, 0>"3.11.033.009708738
"BM_Cmp<14, 0>"3.091.033
"BM_Cmp<15, 0>"3.091.033
"BM_Cmp<16, 0>"0.8440.8431.00118624
"BM_Cmp<17, 0>"1.331.321.007575758
"BM_Cmp<18, 0>"1.331.321.007575758
"BM_Cmp<19, 0>"3.371.262.674603175
"BM_Cmp<20, 0>"1.221.211.008264463
"BM_Cmp<21, 0>"3.091.262.452380952
"BM_Cmp<22, 0>"3.081.262.444444444
"BM_Cmp<23, 0>"3.071.262.436507937
"BM_Cmp<24, 0>"1.211.211
"BM_Cmp<25, 0>"3.351.262.658730159
"BM_Cmp<26, 0>"3.631.272.858267717
"BM_Cmp<27, 0>"3.351.262.658730159
"BM_Cmp<28, 0>"3.071.262.436507937
"BM_Cmp<29, 0>"3.351.262.658730159
"BM_Cmp<30, 0>"3.351.262.658730159
"BM_Cmp<31, 0>"3.361.262.666666667
"BM_Cmp<32, 0>"1.261.261
"BM_Cmp<0, 7>"0.2890.2871.006968641
"BM_Cmp<1, 7>"0.640.6351.007874016
"BM_Cmp<2, 7>"0.6380.6331.007898894
"BM_Cmp<3, 7>"1.081.071.009345794
"BM_Cmp<4, 7>"0.6340.6350.9984251969
"BM_Cmp<5, 7>"1.081.071.009345794
"BM_Cmp<6, 7>"1.071.071
"BM_Cmp<7, 7>"2.811.032.72815534
"BM_Cmp<8, 7>"0.6370.6321.007911392
"BM_Cmp<9, 7>"1.071.071
"BM_Cmp<10, 7>"1.071.071
"BM_Cmp<11, 7>"3.371.033.27184466
"BM_Cmp<12, 7>"1.031.031
"BM_Cmp<13, 7>"3.641.033.533980583
"BM_Cmp<14, 7>"3.361.033.262135922
"BM_Cmp<15, 7>"3.631.033.524271845
"BM_Cmp<16, 7>"0.8420.8440.9976303318
"BM_Cmp<17, 7>"1.331.331
"BM_Cmp<18, 7>"1.331.331
"BM_Cmp<19, 7>"3.631.262.880952381
"BM_Cmp<20, 7>"1.211.211
"BM_Cmp<21, 7>"3.931.263.119047619
"BM_Cmp<22, 7>"3.91.263.095238095
"BM_Cmp<23, 7>"3.931.253.144
"BM_Cmp<24, 7>"1.221.211.008264463
"BM_Cmp<25, 7>"3.921.263.111111111
"BM_Cmp<26, 7>"3.631.262.880952381
"BM_Cmp<27, 7>"3.921.263.111111111
"BM_Cmp<28, 7>"3.631.262.880952381
"BM_Cmp<29, 7>"3.931.263.119047619
"BM_Cmp<30, 7>"3.931.263.119047619
"BM_Cmp<31, 7>"3.931.263.119047619
"BM_Cmp<32, 7>"1.261.261
"BM_Cmp<0, 15>"0.2870.2871
"BM_Cmp<1, 15>"0.6370.6351.003149606
"BM_Cmp<2, 15>"0.6330.6311.003169572
"BM_Cmp<3, 15>"1.081.071.009345794
"BM_Cmp<4, 15>"0.6340.6331.001579779
"BM_Cmp<5, 15>"1.081.071.009345794
"BM_Cmp<6, 15>"1.071.071
"BM_Cmp<7, 15>"2.791.032.708737864
"BM_Cmp<8, 15>"0.6350.640.9921875
"BM_Cmp<9, 15>"1.071.080.9907407407
"BM_Cmp<10, 15>"1.081.071.009345794
"BM_Cmp<11, 15>"3.081.032.990291262
"BM_Cmp<12, 15>"1.031.031
"BM_Cmp<13, 15>"3.081.032.990291262
"BM_Cmp<14, 15>"3.091.033
"BM_Cmp<15, 15>"3.091.033
"BM_Cmp<16, 15>"0.8420.8440.9976303318
"BM_Cmp<17, 15>"1.331.331
"BM_Cmp<18, 15>"1.321.330.992481203
"BM_Cmp<19, 15>"3.631.262.880952381
"BM_Cmp<20, 15>"1.211.211
"BM_Cmp<21, 15>"3.911.263.103174603
"BM_Cmp<22, 15>"3.921.263.111111111
"BM_Cmp<23, 15>"3.941.263.126984127
"BM_Cmp<24, 15>"1.221.211.008264463
"BM_Cmp<25, 15>"3.911.263.103174603
"BM_Cmp<26, 15>"3.631.262.880952381
"BM_Cmp<27, 15>"3.921.263.111111111
"BM_Cmp<28, 15>"3.651.262.896825397
"BM_Cmp<29, 15>"3.931.253.144
"BM_Cmp<30, 15>"3.931.263.119047619
"BM_Cmp<31, 15>"3.921.263.111111111
"BM_Cmp<32, 15>"1.261.261
"BM_Cmp<0, 24>"0.2850.2860.9965034965
"BM_Cmp<1, 24>"0.6390.6381.001567398
"BM_Cmp<2, 24>"0.6340.6331.001579779
"BM_Cmp<3, 24>"1.071.071
"BM_Cmp<4, 24>"0.6360.6331.004739336
"BM_Cmp<5, 24>"1.081.071.009345794
"BM_Cmp<6, 24>"1.081.071.009345794
"BM_Cmp<7, 24>"2.81.032.718446602
"BM_Cmp<8, 24>"0.6330.6350.9968503937
"BM_Cmp<9, 24>"1.071.080.9907407407
"BM_Cmp<10, 24>"1.081.071.009345794
"BM_Cmp<11, 24>"3.081.032.990291262
"BM_Cmp<12, 24>"1.031.031
"BM_Cmp<13, 24>"3.081.032.990291262
"BM_Cmp<14, 24>"3.081.032.990291262
"BM_Cmp<15, 24>"3.091.033
"BM_Cmp<16, 24>"0.8440.8431.00118624
"BM_Cmp<17, 24>"1.331.331
"BM_Cmp<18, 24>"1.331.321.007575758
"BM_Cmp<19, 24>"3.371.262.674603175
"BM_Cmp<20, 24>"1.211.211
"BM_Cmp<21, 24>"3.081.262.444444444
"BM_Cmp<22, 24>"3.071.262.436507937
"BM_Cmp<23, 24>"3.071.262.436507937
"BM_Cmp<24, 24>"1.211.211
"BM_Cmp<25, 24>"3.351.262.658730159
"BM_Cmp<26, 24>"3.631.262.880952381
"BM_Cmp<27, 24>"4.211.263.341269841
"BM_Cmp<28, 24>"3.941.263.126984127
"BM_Cmp<29, 24>"4.21.263.333333333
"BM_Cmp<30, 24>"4.21.263.333333333
"BM_Cmp<31, 24>"4.481.263.555555556
"BM_Cmp<32, 24>"1.271.261.007936508
spatel added a comment.Dec 5 2018, 9:35 AM

The change makes the impacted values 2.5 - 3x as fast.

Thanks - that's an impressive speedup. Which CPU uarch was that run on?

This is on haswell, but I expect other Intel chips to behave similarly.

JohnReagan added inline comments.
lib/Target/X86/X86TargetTransformInfo.cpp
2903

Should this be guarded with hasSSE2()? Does it make sense for -no-sse compiles?

One of my coworkers did an informal test last year and saw that newer Intel CPUs' optimized REP-string-op instructions were faster than using SSE2 (he used large data sizes, not anything in the shorter ranges this patch deals with). Is that something that should be looked at? (Or has somebody done that examination already?)

courbet marked an inline comment as done.Dec 5 2018, 11:16 PM

One of my coworkers did an informal test last year and saw that newer Intel CPUs' optimized REP-string-op instructions were faster than using SSE2 (he used large data sizes, not anything in the shorter ranges this patch deals with). Is that something that should be looked at? (Or has somebody done that examination already?)

Yes, I'm planning to work on this next :) It should go in SelectionDAGTargetInfo::EmitTargetCodeForMemcmp(), similar to what we did for memcpy and memset though.

lib/Target/X86/X86TargetTransformInfo.cpp
2903

This should be on for all compiles, see e.g. the test case for N=7.

Here's a basic benchmark for memcmp(a, b, N) where N is a compile-time constant, and a and b differ first at character M:

The change makes the impacted values 2.5 - 3x as fast.

Nice patch Clement :-).

lib/Target/X86/X86TargetTransformInfo.cpp
2902

s/form/from.

Strictly speaking, SSE1 provides MOVUPS for unaligned vector FP loads.
However, it gets problematic when comparing vectors for equality; using CMPEQPS is not going to work as expected for the case where one of the operands is NaN.
One of your tests shows that the expansion is effectively disabled if the target has SSE but not SSE2. However, as John wrote, I don't see where the check for SSE2 is done...
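
A minimal sketch of the NaN hazard described here (illustrative only; the function name is made up): SSE1 can do the unaligned loads, but CMPEQPS is a floating-point equality, so two bit-identical 16-byte blocks that happen to encode a NaN would compare as different.

```
#include <xmmintrin.h>  // SSE1 only

// Broken 16-byte "equality" using only SSE1: wrong whenever the bytes
// reinterpret as NaN, because cmpeqps(NaN, NaN) is false.
bool equal16_sse1_broken(const float *a, const float *b) {
  __m128 va = _mm_loadu_ps(a);   // MOVUPS: unaligned loads are fine on SSE1
  __m128 vb = _mm_loadu_ps(b);
  __m128 eq = _mm_cmpeq_ps(va, vb);
  return _mm_movemask_ps(eq) == 0xF;  // false negative for identical NaN bits
}
```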

One of my coworkers did an informal test last year and saw that newer Intel CPUs' optimized REP-string-op instructions were faster than using SSE2 (he used large data sizes, not anything in the shorter ranges this patch deals with). Is that something that should be looked at? (Or has somebody done that examination already?)

Only rep movs and rep stos are fast (memcpy and memset) on current Intel and AMD.

repe cmpsb (memcmp) and repne scasb (memchr) run at worse than 2 or 1 cycle per compare (respectively) on mainstream Intel CPUs. The microcode simply loops 1 byte at a time. See Agner Fog's instruction tables (https://agner.org/optimize/)

AFAIK there's no plan to change this in future CPUs.

rep stos/movs might become useful even for short copies in IceLake, I think, with the expected short-rep feature, but I haven't heard of any plan to have optimized microcode for the compare functions with data-dependent stop conditions.

And yes, on CPUs with 256-bit or 512-bit data paths internally, rep stos/movs can take advantage of them and be faster than SSE2. (Close to it with AVX or AVX512: a vector loop is often still best, even on CPUs with the ERMSB feature.) See https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy

One of my coworkers did an informal test last year and saw that newer Intel CPUs' optimized REP-string-op instructions were faster than using SSE2 (he used large data sizes, not anything in the shorter ranges this patch deals with). Is that something that should be looked at? (Or has somebody done that examination already?)

Yes, I'm planning to work on this next :) It should go in SelectionDAGTargetInfo::EmitTargetCodeForMemcmp(), similar to what we did for memcpy and memset though.

As long as we don't enable it for AMD, I am fine.
Instructions with a REP prefix incur a significant setup overhead, so they are definitely to be avoided if the repeat count is small. Even on larger data sets (at least on AMD), a loop of vector operations would still provide better throughput than REP MOVS/CMPQ.

pcordes added a comment.EditedDec 6 2018, 3:52 AM

I just looked over the codegen changes so far, but I want to add some more knowledgeable x86 hackers to have a look too. There are 2 concerns:

  1. Are there any known uarch problems with overlapping loads?

No, other than it implies unaligned. Even overlapping stores are fine, and are absorbed by the store buffer. (The 2 to 3x speedup reported on Haswell sounds totally reasonable.)

With very recently stored data, we might possibly be introducing store-forwarding stalls by misaligning a load relative to an earlier store. (Separate from the issue of absolute alignment.)

But if it was copied with a pair of overlapping loads/stores, then hopefully we load/store in an order that allows one load to fully overlap one of the stores that put the data there. (glibc memcpy uses a pair of overlapping loads + a pair of stores for sizes up to 2x the vector width. https://code.woboq.org/userspace/glibc/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S.html#19 has nice comments describing the strategy. But I forget what happens for inlined memcpy for compile-time constant sizes with gcc and llvm.) This is only relevant where memcmp can inline, and that's likely to be cases where a memcpy would also have inlined, if there was a memcpy involved in the source at all.
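
For reference, the overlapping-copy strategy mentioned above looks roughly like this for 17..32-byte sizes (a sketch of the idea, not glibc's or LLVM's actual lowering; the function name is invented):

```
#include <cstddef>
#include <emmintrin.h>  // SSE2

// Copy n bytes (16 < n <= 32) with one pair of possibly-overlapping
// 16-byte loads followed by the matching pair of stores.
void copy17to32(char *dst, const char *src, size_t n) {
  __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
  __m128i tail = _mm_loadu_si128(
      reinterpret_cast<const __m128i *>(src + n - 16));  // overlaps head if n < 32
  _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), head);
  _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + n - 16), tail);
}
```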

  1. Are there any known uarch problems with unaligned accesses (either scalar or SSE)?

*Unaligned* loads are a potential minor slowdown if they cross cache-line boundaries (or, on AMD, maybe even 32-byte or 16-byte boundaries). There is literally zero penalty when they don't cross any relevant boundary on modern CPUs (on Intel, that's 64-byte cache lines).

On Core2 and earlier, and K8 and earlier, movups or movdqu unaligned 16-byte loads are slowish even if the vector load doesn't cross a cache-line boundary. (The instruction decodes to multiple uops using a pessimistic strategy.) Nehalem and K10 have efficient unaligned vector loads. (Nehalem and Bulldozer have efficient unaligned vector *stores*.)

But I expect it's still worth it vs. a memcpy library function call, even on old CPUs for 16-byte vectors.

*Page* splits (4k boundary) are much slower on Intel before Skylake. Apparently Intel discovered that page splits in real life are more common than they had been tuning for, so they put extra hardware in Skylake to make the latency no worse than a cache-line split, and throughput still decent, when both sides get TLB hits.

I tested some of this a while ago: https://stackoverflow.com/questions/45128763/how-can-i-accurately-benchmark-unaligned-access-speed-on-x86-64 That has a decent summary of the things to watch out for when worrying about unaligned loads.


On non-x86, I'm not sure how unaligned loads are handled in hardware. Many ISAs do support them (MIPS32r6 requires them, and I think AArch64 does too), but I can't comment on the efficiency. I think it takes a significant amount of transistors to make them as cheap as on modern x86, but it's probably still worth it vs. spending more instructions. One unaligned load is probably not going to cost much more than 2 or 3 aligned loads.

courbet marked an inline comment as done.Dec 6 2018, 3:58 AM

Instructions with a REP prefix incur a significant setup overhead, so they are definitely to be avoided if the repeat count is small. Even on larger data sets (at least on AMD), a loop of vector operations would still provide better throughput than REP MOVS/CMPQ.

Intel has the same issue actually. IIRC we only lower to repmovs for very large sizes, when alwaysinline is true.

repe cmpsb (memcmp) and repne scasb (memchr) run at worse than 2 or 1 cycle per compare (respectively) on mainstream Intel CPUs. The microcode simply loops 1 byte at a time. See Agner Fog's instruction tables (https://agner.org/optimize/) AFAIK there's no plan to change this in future CPUs.

Thanks for the info.

lib/Target/X86/X86TargetTransformInfo.cpp
2902

If you look a few lines above, we only allow the 16-byte expansion if ST->hasSSE2().

I just looked over the codegen changes so far, but I want to add some more knowledgeable x86 hackers to have a look too. There are 2 concerns:
<snip>

  1. Are there any known uarch problems with unaligned accesses (either scalar or SSE)?

*Unaligned* loads are a potential minor slowdown if they cross cache-line boundaries (or, on AMD, maybe even 32-byte or 16-byte boundaries). There is literally zero penalty when they don't cross any relevant boundary on modern CPUs (on Intel, that's 64-byte cache lines).

+1

On AMD, a misaligned store or load operation suffers a minimum one-cycle penalty if it crosses a boundary (definitely 16-byte for AMD Family 15h/16h processors).

I also agree with Peter when he says that it is still worth it to pay a small penalty vs. doing a memcpy library call.
Speaking about memcpy: It is worth mentioning that - at least on Jaguar - the LS knows how to minimize the impact of stalls due to repeated misaligned accesses. Quoting the AMDfam15h SOG: "Writes to the Data cache which are unaligned in an "address" are written in two cycles. If consecutive unaligned addressed 128-bit loads are written they can be coalesced such that the 64-bit portions of 128-bit writes which were unaligned can be merged and written 128-bits at a time, removing most the stall penalties. This is performed in the Store Coalescing Buffer (SCB)."

Back on topic:
For x86, I think this patch is a nice improvement. Not sure about other targets.

lib/Target/X86/X86TargetTransformInfo.cpp
2902

Cheers.

I just looked over the codegen changes so far, but I want to add some more knowledgeable x86 hackers to have a look too. There are 2 concerns:

  1. Are there any known uarch problems with overlapping loads?
  2. Are there any known uarch problems with unaligned accesses (either scalar or SSE)?

If there's any perf data (either nano-benchmarks or full apps) to support the changes, that would be nice to see. This reminds me of PR33329:
https://bugs.llvm.org/show_bug.cgi?id=33329 (can we close that now?)

Let's not lose sight of the big picture here. If uarch problems exist, are they *worse* than the cost of calling memcmp()? In other words, are the likely register spills, function call overhead, and dynamic algorithm selection (given that the constantness of the size parameter is lost) worth it?

I just looked over the codegen changes so far, but I want to add some more knowledgeable x86 hackers to have a look too. There are 2 concerns:

  1. Are there any known uarch problems with overlapping loads?
  2. Are there any known uarch problems with unaligned accesses (either scalar or SSE)?

If there's any perf data (either nano-benchmarks or full apps) to support the changes, that would be nice to see. This reminds me of PR33329:
https://bugs.llvm.org/show_bug.cgi?id=33329 (can we close that now?)

Let's not lose sight of the big picture here. If uarch problems exist, are they *worse* than the cost of calling memcmp()? In other words, are the likely register spills, function call overhead, and dynamic algorithm selection (given that the constantness of the size parameter is lost) worth it?

No real argument from me - I just wanted to be sure that nobody knew of some pre-existing uarch disaster potential. The change should be a nice win for the general x86 case. I've just added some nits in the inline comments.

include/llvm/Analysis/TargetTransformInfo.h
584

Rephrase: "Set to true to allow overlapping loads. For example, ..."

lib/CodeGen/ExpandMemCmp.cpp
74–75

Could be independent of this patch, but it would be less cryptic to change "WRT" to "from" since we're making changes here.

157

This could use an explanatory comment and/or example with small numbers.
We can assert that MaxLoadSize > 1 on entry to this function?
If Size == 0 at this point, shouldn't we return immediately because that's not an overlapping sequence? Or can that be asserted? That would mean that the optimal greedy (non-overlapping) sequence was not already found.
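
To make the question concrete, here is a small worked model of the overlapping sequence (a sketch of the idea, not the patch's computeOverlappingLoadSequence; the helper name is invented). For Size = 7 and MaxLoadSize = 4, the greedy non-overlapping sequence needs 4 + 2 + 1 byte loads, while the overlapping one needs just two 4-byte loads at offsets 0 and 3.

```
#include <cstdint>
#include <utility>

// Number of MaxLoadSize loads needed when overlap is allowed, plus the offset
// of the final (pulled-back) load. Assumes Size >= MaxLoadSize > 1.
std::pair<uint64_t, uint64_t> overlappingLoadPlan(uint64_t Size,
                                                  uint64_t MaxLoadSize) {
  uint64_t NumLoads = (Size + MaxLoadSize - 1) / MaxLoadSize;  // ceil(Size / MaxLoadSize)
  uint64_t LastOffset = Size - MaxLoadSize;  // final load ends exactly at Size
  return {NumLoads, LastOffset};
}
```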

259

Returns -> Return

lib/Target/X86/X86TargetTransformInfo.cpp
2892–2893

Independent of this patch, but I think this can be enabled. See:
rL342989

I just looked over the codegen changes so far, but I want to add some more knowledgeable x86 hackers to have a look too. There are 2 concerns:

  1. Are there any known uarch problems with overlapping loads?
  2. Are there any known uarch problems with unaligned accesses (either scalar or SSE)?

If there's any perf data (either nano-benchmarks or full apps) to support the changes, that would be nice to see. This reminds me of PR33329:
https://bugs.llvm.org/show_bug.cgi?id=33329 (can we close that now?)

Let's not lose sight of the big picture here. If uarch problems exist, are they *worse* than the cost of calling memcmp()? In other words, are the likely register spills, function call overhead, and dynamic algorithm selection (given that the constantness of the size parameter is lost) worth it?

No real argument from me - I just wanted to be sure that nobody knew of some pre-existing uarch disaster potential. The change should be a nice win for the general x86 case. I've just added some nits in the inline comments.

Great. For whatever it may be worth, I'd wager that the micro-benchmark doesn't simulate branch prediction failures and therefore the performance win is higher in practice.

One of the challenges that is often overlooked with memcmp and memcpy (or any design with dynamic algorithm selection based on inputs) is that in the real world, inputs are often random from one call to the next, and therefore the algorithm selection branches are unpredictable.

Let's not lose sight of the big picture here. If uarch problems exist, are they *worse* than the cost of calling memcmp()?

Almost certainly no, even for memcpy where potential store-forwarding stalls or 4k aliasing are a pretty minor concern most of the time.

I pointed those things out so the new unaligned load/store code-gen can be the best it can be while people are working on that code anyway, *not* because I think there's a risk of overall regressions.

In other words, are the likely register spills, function call overhead, and dynamic algorithm selection (given that the constantness of the size parameter is lost) worth it?

Right, libc memcmp / memcpy are not cheap for tiny sizes. A couple cmp dword [rdi], imm32 / jne instructions should be better in almost every way, maybe even including code size at the call site depending on how many reloads we avoid.

By the way, LLVM itself will benefit nicely from this change. For example, the code gen for llvm::StringSwitch will improve dramatically.
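
As an illustration (not a specific call site in LLVM), every .Case() below is a comparison against a short constant-length string, i.e. exactly the kind of small fixed-size memcmp this patch expands inline:

```
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

int parseColor(llvm::StringRef S) {
  return llvm::StringSwitch<int>(S)
      .Case("red", 0)
      .Case("green", 1)
      .Case("magenta", 2)  // 7 bytes: previously a memcmp call, now two loads
      .Default(-1);
}
```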

Let's not lose sight of the big picture here. If uarch problems exist, are they *worse* than the cost of calling memcmp()?

Almost certainly no, even for memcpy where potential store-forwarding stalls or 4k aliasing are a pretty minor concern most of the time.

I pointed those things out so the new unaligned load/store code-gen can be the best it can be while people are working on that code anyway, *not* because I think there's a risk of overall regressions.

+1

I always find Peter's comments very useful/informative.
I don't think that anybody is losing sight of the big picture here.

Sounds like everyone is in agreement about the overall direction. There are just a few inline comments/questions to answer, and then we should be good to go.

courbet updated this revision to Diff 178070.Dec 13 2018, 7:56 AM
courbet marked 5 inline comments as done.

address comments

Unrelated diffs got uploaded?

Unrelated diffs got uploaded?

Weird. They are not in my local commit. I guess it's a git-svn weirdness.

lib/Target/X86/X86TargetTransformInfo.cpp
2892–2893

Great, I'll look at it in a followup patch.

A couple of minor style issues I noticed

lib/CodeGen/ExpandMemCmp.cpp
141

Remove braces

167

Add brackets to make this logic more obvious - don't rely on people's understanding of operator precedence!

courbet updated this revision to Diff 178206.Dec 14 2018, 2:10 AM
courbet marked 2 inline comments as done.

address Simon's comments.

spatel added inline comments.Dec 14 2018, 6:11 AM
lib/CodeGen/ExpandMemCmp.cpp
166

I'm still not clear on this: if Size is 0, does that imply that computeGreedy failed?

lib/Target/X86/X86TargetTransformInfo.cpp
2902

form -> from (as suggested previously)

pcordes added inline comments.Dec 15 2018, 8:00 AM
lib/Target/X86/X86TargetTransformInfo.cpp
2892–2893

Be very careful of sprinkling small bits of 512-bit vector stuff into code that isn't already heavily using 512-bit vectors.

It's fine for tune=KNL, but for Skylake-avx512 (and generic with -mavx512f) tuning keep in mind that executing one 512-bit vector instruction on Intel Skylake puts the whole core into AVX512 mode, reducing max turbo significantly and shutting down the vector ALUs on port 1. (So vpaddd throughput goes down from 3 to 2, for example). And on CPUs without a second 512-bit FMA unit (on port 5 with higher latency) that can be powered up, throughput on FP everything, integer multiplies and shifts, and many other instructions goes down too, even without considering the extra resource conflicts from having fewer vector ALU ports. (e.g. many Xeon Bronze chips have only 512-bit FMA). https://stackoverflow.com/questions/50517770/disable-avx-512-intrinsics-in-vs-2017#comment88055158_50517770

BTW, the integer ALUs on port1 stay active, so it can still run scalar integer stuff like popcnt even when it's shut down for instructions like pxor.


I believe this happens even from just copying with vmovdqa64, even without using any 512-bit ALU instructions like vpcmpb.

This can have a significant overall negative impact on code that's mostly scalar, and doesn't have many / any loops that benefit from 512-bit vectors.


(Note that 256-bit vectors with AVX512VL can be great, taking advantage of AVX512 mask registers and new instructions, and twice as many xmm/ymm registers with ymm16..31.

You can even avoid VZEROUPPER for short non-looping uses of 256-bit registers, like for inline memcmp, by using only those new regs that can't be accessed with legacy SSE. At the minor cost of always needing the longer 4-byte EVEX encoding, not a 2- or 3-byte VEX prefix. Another possible downside is leaving more FPU state dirty for context switches: xsaveopt can omit saving upper halves of YMM regs if they're all clean. And possibly tying up more physical registers, but only vzeroall would avoid that. Each PRF entry is at least 256 bits wide, probably actually 512-bit on Intel CPUs.

But make sure you never omit VZEROUPPER after using a zmm register, otherwise Intel CPUs will be stuck with slower turbo: https://chat.stackoverflow.com/transcript/message/43768745#43768745 even though the port 1 vector ALU shutdown only lasts while 512-bit uops are actually in flight. Actually, BeeOnRope reported that dirtying ZMM16..31 didn't leave max turbo affected, but using a 512-bit uop would still cause a downclock to power up the AVX512 hardware, so we don't want to randomly use ZMM regs for that reason either. Switching clocks takes tens of thousands of cycles, so this is bad.

Anyway, https://stackoverflow.com/questions/49019614/is-it-useful-to-use-vzeroupper-if-your-programlibraries-contain-no-sse-instruct mentions some of these side-benefits of vzeroupper)

2902

The comment is still giving the wrong reason: unaligned loads aren't the problem; the lack of SIMD compare instructions is the reason we need SSE2 and AVX2 (not SSE1 and AVX1) for 16- and 32-byte expansion of memcmp.

How about:

// All GPR and vector loads can be unaligned. SIMD compare requires integer vectors (SSE2/AVX2)

davezarzycki added inline comments.Dec 18 2018, 4:40 AM
lib/Target/X86/X86TargetTransformInfo.cpp
2892–2893

Hi @pcordes – Just FYI, the compiler already sprinkles in AVX512 for memcpy and memset. Also, auto-vectorization can sprinkle in unprofitable AVX512 code. From what I've seen, the advice seems to be: use -mno-avx512f if the sprinkled results aren't profitable.

courbet updated this revision to Diff 178843.Dec 19 2018, 1:00 AM
courbet marked 5 inline comments as done.

address review comments

Sorry for the delay.

lib/CodeGen/ExpandMemCmp.cpp
166

A zero size indeed means that greedy should be optimal. The reason I was handling this case is that it makes computeOverlappingLoadSequence stand by itself, without having to refer to the greedy approach. But it's true that it duplicates work, so I changed it to bail out.

spatel accepted this revision.Dec 19 2018, 7:03 AM

LGTM - thanks for dealing with all of the nits. :)

The AVX512 discussion is fascinating, but independent, and as Dave mentioned, we may already be producing those vector ops in other places. Craig was looking at how to limit that usage...or maybe the hardware just gets better at dealing with it some day.

This revision is now accepted and ready to land.Dec 19 2018, 7:03 AM

Thank you all for the comments.

This revision was automatically updated to reflect the committed changes.