This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/test/
-
test/
-
CodeGen/NVPTX/
-
NVPTX/
-
LoadStoreVectorizer.ll
-
MachineSink-call.ll
-
MachineSink-convergent.ll
-
TailDuplication-convergent.ll
-
access-non-generic.ll
-
add-128bit.ll
-
addrspacecast-gvar.ll
-
addrspacecast.ll
-
aggr-param.ll
-
aggregate-return.ll
-
annotations.ll
-
arg-lowering.ll
-
arithmetic-fp-sm20.ll
-
arithmetic-int.ll
-
async-copy.ll
-
atomics-sm60.ll
-
atomics-with-scope.ll
-
atomics.ll
-
b52037.ll
-
barrier.ll
-
bfe.ll
-
branch-fold.ll
-
bug17709.ll
-
bug21465.ll
-
bug22246.ll
-
bug22322.ll
-
bug26185-2.ll
-
bug26185.ll
1
bug41651.ll
-
bypass-div.ll
-
call-with-alloca-buffer.ll
-
callchain.ll
-
calling-conv.ll
-
calls-with-phi.ll
-
combine-min-max.ll
-
compare-int.ll
-
constant-vectors.ll
-
convert-fp.ll
-
convert-int-sm20.ll
-
convert-sm80.ll
-
ctlz.ll
-
ctpop.ll
-
cttz.ll
-
disable-opt.ll
-
div-ri.ll
-
divrem-combine.ll
-
envreg.ll
-
extloadv.ll
-
f16-ex2.ll
-
f16-instructions.ll
-
f16x2-instructions.ll
-
fast-math.ll
-
fma-assoc.ll
-
fma-disable.ll
-
fma.ll
-
fminimum-fmaximum.ll
-
fns.ll
-
fp-contract.ll
-
fp-literals.ll
-
fp16.ll
-
function-align.ll
-
generic-to-nvvm.ll
-
global-addrspace.ll
-
global-ordering.ll
-
global-variable-big.ll
-
global-visibility.ll
-
globals_init.ll
-
globals_lowering.ll
-
half.ll
-
i1-global.ll
-
i1-int-to-fp.ll
-
i1-param.ll
-
i128-global.ll
-
i128-param.ll
-
i128-retval.ll
-
i128-struct.ll
-
i8-param.ll
-
idioms.ll
-
imad.ll
-
inline-asm.ll
-
inlineasm-output-template.ll
-
intrinsic-old.ll
-
intrinsics.ll
-
isspacep.ll
-
ld-addrspace.ll
-
ld-generic.ll
-
ld-st-addrrspace.py
-
ldg-invariant.ll
-
ldparam-v4.ll
-
ldu-i8.ll
-
ldu-ldg.ll
-
ldu-reg-plus-offset.ll
-
libcall-fulfilled.ll
-
load-sext-i1.ll
-
load-store.ll
-
load-with-non-coherent-cache.ll
-
local-stack-frame.ll
-
lower-aggr-copies.ll
-
lower-alloca.ll
-
lower-args.ll
-
lower-byval-args.ll
-
lower-kernel-ptr-arg.ll
-
machine-sink.ll
-
managed.ll
-
match.ll
-
math-intrins-sm53-ptx42.ll
-
math-intrins-sm80-ptx70.ll
-
math-intrins-sm86-ptx72.ll
-
math-intrins.ll
-
mbarrier.ll
-
minmax-negative.ll
-
misaligned-vector-ldst.ll
-
module-inline-asm.ll
-
mulwide.ll
-
named-barriers.ll
-
no-extra-parens.ll
-
nofunc.ll
-
nounroll.ll
-
nvcl-param-align.ll
-
nvvm-annotations-D120129.ll
-
param-align.ll
-
param-load-store.ll
-
param-vectorize-device.ll
-
param-vectorize-kernel.ll
-
pow2_mask_cmp.ll
-
pr13291-i1-store.ll
-
pr16278.ll
-
pr17529.ll
-
read-global-variable-constant.ll
-
redux-sync.ll
-
refl1.ll
-
reg-copy.ll
-
reg-types.ll
-
rotate.ll
-
rotate_64.ll
-
sched1.ll
-
sched2.ll
-
sext-in-reg.ll
-
sext-params.ll
-
shfl-p.ll
-
shfl-sync-p.ll
-
shfl-sync.ll
-
shfl.ll
-
shift-parts.ll
-
simple-call.ll
-
sqrt-approx.ll
-
st-addrspace.ll
-
st-generic.ll
-
store-retval.ll
-
surf-read-cuda.ll
-
surf-read.ll
-
surf-tex.py
-
surf-write-cuda.ll
-
surf-write.ll
-
symbol-naming.ll
-
tex-read-cuda.ll
-
tex-read.ll
-
texsurf-queries.ll
-
tid-range.ll
-
tuple-literal.ll
-
vec-param-load.ll
-
vec8.ll
-
vector-args.ll
-
vector-call.ll
-
vector-compare.ll
-
vector-global.ll
-
vector-loads.ll
-
vector-select.ll
-
vector-stores.ll
-
vectorize-misaligned.ll
-
vote.ll
-
weak-global.ll
-
weak-linkage.ll
-
wmma.py
-
zeroext-32bit.ll
-
DebugInfo/NVPTX/
-
NVPTX/
-
crash-missing-DISubprogram.ll
-
cu-range-hole.ll
-
dbg-declare-alloca.ll
-
dbg-value-const-byref.ll
-
debug-addr-class.ll
-
debug-empty.ll
-
debug-file-loc-only.ll
-
debug-file-loc.ll
-
debug-info.ll
-
debug-loc-offset.ll
-
debug-name-table.ll
-
dwarf-file-dir.ll
-
packed_bitfields.ll
-
lit.cfg.py
-
lit.site.cfg.py.in

Differential D121727

[NVPTX] Integrate ptxas to LIT tests
ClosedPublic

Authored by asavonic on Mar 15 2022, 12:03 PM.

Download Raw Diff

Details

Reviewers

tra
jholewinski
krisb

Commits

rG0f1b5f115a7f: [NVPTX] Integrate ptxas to LIT tests

Summary

ptxas is a proprietary compiler from Nvidia that can compile PTX to
machine code (SASS). It has a lot of diagnostics to catch errors
in PTX, which can be used to verify PTX output from llc.

Set -DPXTAS_EXECUTABLE=/path/to/ptxas CMake option to enable it.
If this option is not set, then ptxas is substituted to true which
effectively disables all ptxas RUN lines.

LLVM_PTXAS_EXECUTABLE environment variable takes precedence over
the CMake option, and allows overriding the ptxas executable that is used for LIT
without complete re-configuration.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

asavonic created this revision.Mar 15 2022, 12:03 PM

Herald added a project: Restricted Project. · View Herald TranscriptMar 15 2022, 12:03 PM

asavonic requested review of this revision.Mar 15 2022, 12:03 PM

Herald added a project: Restricted Project. · View Herald TranscriptMar 15 2022, 12:03 PM

Herald added a subscriber: llvm-commits. · View Herald Transcript

This patch is here to start a discussion and get feedback.
Once we agree on the implementation I will update all other LIT tests in the same way.

Substituting true may not always work. It's OK if we only need pass/fail result from ptxas, but we may want to check ptxas output, too.

Can we make ptxas a feature instead? The tests that need it would use REQUIRES: ptxas and would be guaranteed to have ptxas available.
It may also need to be versioned, too. Some tests may require recent enough version of ptxas in order to work (e.g. we may want to use it to test new instructions that an older ptxas may not understand).

In D121727#3383453, @tra wrote:

Substituting true may not always work. It's OK if we only need pass/fail result from ptxas, but we may want to check ptxas output, too.

Can we make ptxas a feature instead? The tests that need it would use REQUIRES: ptxas and would be guaranteed to have ptxas available.
It may also need to be versioned, too. Some tests may require recent enough version of ptxas in order to work (e.g. we may want to use it to test new instructions that an older ptxas may not understand).

We should probably have both: substitution to true works for *all* existing tests where we can use ptxas as a sanity check, and a feature allows us to write tests for machine code.

Harbormaster completed remote builds in B154398: Diff 415531.Mar 15 2022, 1:10 PM

In D121727#3383523, @asavonic wrote:

We should probably have both: substitution to true works for *all* existing tests where we can use ptxas as a sanity check, and a feature allows us to write tests for machine code.

Wouldn't it be redundant? If REQUIRES: ptxas is satisfied, there's no need for using true as a substitute. If it's not satisfied, then we would not run such test and therefore true would not be useful either.

Using ptxas in a test w/o REQUIRES: ptxas would be a user error, IMO as the test would be relying on something that's not expected to be available by default and would not do anything useful.
I would actually make this assertion even stronger -- allowing a test to run and return a success when we in fact didn't test anything would be a wrong thing to do. We should correctly report such test as not executed.

In D121727#3383955, @tra wrote:

In D121727#3383523, @asavonic wrote:

We should probably have both: substitution to true works for *all* existing tests where we can use ptxas as a sanity check, and a feature allows us to write tests for machine code.

Wouldn't it be redundant? If REQUIRES: ptxas is satisfied, there's no need for using true as a substitute. If it's not satisfied, then we would not run such test and therefore true would not be useful either.

Using ptxas in a test w/o REQUIRES: ptxas would be a user error, IMO as the test would be relying on something that's not expected to be available by default and would not do anything useful.
I would actually make this assertion even stronger -- allowing a test to run and return a success when we in fact didn't test anything would be a wrong thing to do. We should correctly report such test as not executed.

ptxas in a test without REQUIRES: ptxas can still work as an optional verification step, similar to opt -verify or llc -verify-machineinstrs.
Yes, if ptxas is not available then these checks will do nothing, but it should be fine as long as there is a way to enable them. We can also setup a buildbot to run with ptxas.
The advantage here is that we can add these checks to all existing LIT tests and make sure that they indeed produce valid PTX code.

For tests that require ptxas to function - REQUIRES: ptxas is definitely needed.

If we want to distinguish these two cases, then we can have two substitutions: ptxas-verify that expands to ptxas -c -o /dev/null (or to true if ptxas is not available), and just ptxas that requires the corresponding feature.

In D121727#3386172, @asavonic wrote:

The advantage here is that we can add these checks to all existing LIT tests and make sure that they indeed produce valid PTX code.

Without ptxas available those tests would not do anything useful, would they? Why not just use REQUIRES: ptxas and avoid adding this special-case substitution of ptxas with true.
I really don't see it buying us anything. There are no existing ptxas tests and adding REQUIRES: ptxas to the new ones sounds like the right thing to do.

For tests that require ptxas to function - REQUIRES: ptxas is definitely needed.

If we want to distinguish these two cases, then we can have two substitutions: ptxas-verify that expands to ptxas -c -o /dev/null (or to true if ptxas is not available), and just ptxas that requires the corresponding feature.

I'm not convinced we need those two cases. I would prefer to keep things simple. If there's no ptxas available -> don't run ptxas tests.
As a user I would be unpleasantly surprised to see a test supposedly launching ptxas passing in my sandbox (because it actually ran 'true', without my knowledge), yet LLVM producing invalid PTX.
I would much rather know upfront that PTX has not been tested with ptxas, rather than risking a false positive if ptxas was not enabled.

@echristo - WDYT? Do we pretend to run any other external tools that we use in tests if they are not available?

Added ptxas and ptxas-X.Y features.
Added ptxas-verify substitution that gets replaced to either ptxas or true.

In D121727#3386851, @tra wrote:

In D121727#3386172, @asavonic wrote:

The advantage here is that we can add these checks to all existing LIT tests and make sure that they indeed produce valid PTX code.

Without ptxas available those tests would not do anything useful, would they? Why not just use REQUIRES: ptxas and avoid adding this special-case substitution of ptxas with true.
I really don't see it buying us anything. There are no existing ptxas tests and adding REQUIRES: ptxas to the new ones sounds like the right thing to do.

The idea is to add ptxas RUN lines to existing ~200 LIT tests (see access-non-generic.ll as an example). Since we cannot add REQUIRES: ptxas to all of them, we can make these new checks optional.

For tests that require ptxas to function - REQUIRES: ptxas is definitely needed.

If we want to distinguish these two cases, then we can have two substitutions: ptxas-verify that expands to ptxas -c -o /dev/null (or to true if ptxas is not available), and just ptxas that requires the corresponding feature.

I'm not convinced we need those two cases. I would prefer to keep things simple. If there's no ptxas available -> don't run ptxas tests.
As a user I would be unpleasantly surprised to see a test supposedly launching ptxas passing in my sandbox (because it actually ran 'true', without my knowledge), yet LLVM producing invalid PTX.
I would much rather know upfront that PTX has not been tested with ptxas, rather than risking a false positive if ptxas was not enabled.

Yes, I agree. Unfortunately, I don't see any other way to reuse the existing tests and RUN lines.
If PXTAS_EXECUTABLE is set, nothing should be ignored.

Harbormaster completed remote builds in B154671: Diff 415934.Mar 16 2022, 1:15 PM

In D121727#3387119, @asavonic wrote:

The idea is to add ptxas RUN lines to existing ~200 LIT tests (see access-non-generic.ll as an example). Since we cannot add REQUIRES: ptxas to all of them, we can make these new checks optional.

Ah. I see. Being able to verify that the generated PTX produced by existing tests does get assembled would be useful.

... Unfortunately, I don't see any other way to reuse the existing tests and RUN lines.

Ideally we'd want lit to provide a way to make some RUN lines conditional on available features, so we could run more tests when we can, without having to rewrite the file.
And we don't have anything like that.

That said, I'm still not quite happy about substituting it with 'true'.
Perhaps we can do it slightly differently:

do not run RUN lines with ptxas in them unless the feature is available, and
document this auto-disabled behavior of ptxas in LLVM's lit docs.

This should give us more consistent behavior. E.g. if someone writes a test checking ptxas output with FileCheck, it would still work consistently. Replacing ptxas with true would cause the test to fail in this scenario, because the RUN line would still be executed and FileCheck would expect to see the output.
It's not quite like predicating RUN lines based on available features, but it would be close enough to be usable in this case. Combined with documentation that may do.

In D121727#3387211, @tra wrote:

Ideally we'd want lit to provide a way to make some RUN lines conditional on available features, so we could run more tests when we can, without having to rewrite the file.
And we don't have anything like that.

That said, I'm still not quite happy about substituting it with 'true'.
Perhaps we can do it slightly differently:

do not run RUN lines with ptxas in them unless the feature is available, and

document this auto-disabled behavior of ptxas in LLVM's lit docs.

This should give us more consistent behavior. E.g. if someone writes a test checking ptxas output with FileCheck, it would still work consistently. Replacing ptxas with true would cause the test to fail in this scenario, because the RUN line would still be executed and FileCheck would expect to see the output.
It's not quite like predicating RUN lines based on available features, but it would be close enough to be usable in this case. Combined with documentation that may do.

Ok, so I made a draft implementation of conditional execution of RUN commands. We can now decide what approach is better:

RUN: ptxas-verify %t-nvptx.ptx
RUN: %(when ptxas ptxas -c %t-nvptx.ptx -o /dev/null)

The first one is substituted to true if the ptxas executable is not available. It does not expose output machine code, so there are no issues with a pipelined FileCheck invocation.
The second one is a new feature of LIT: %(when cond command) is expanded to command if cond feature is available.

If we decide to proceed with the second approach, I'll finalize and submit it as a separate patch.

Added a draft for %(when cond command) LIT syntax.

Herald added a subscriber: delcypher. · View Herald TranscriptMar 17 2022, 1:39 PM

In D121727#3390353, @asavonic wrote:

Added a draft for %(when cond command) LIT syntax.

RUN: %(when ptxas ptxas -c %t-nvptx.ptx -o /dev/null)

That may work. I'd change the syntax a bit to separate the condition check from command itself.
E.g. something like RUN: %when(ptxas) ptxas -c %t-nvptx.ptx -o /dev/null

Perhaps we can just expose available features via environment variables and then let shell do the work.
E.g. set LIT_HAS_PTXAS=1 and then use RUN: ((LIT_HAS_PTXAS)) && ptxas command

The downside of this approach is that it assumes that we do run tests using shell, which may not always be the case (on Windows?).

Another approach would be to use RUN-<feature>: to mark conditional lines. Or RUNIF:feature1,feature2:

I'm just thinking out loud. Some sort of %when predicate would work for me, but I'm not the owner of lit. Speaking of owners, we should add someone familiar to the review.
Summoning @mstorsjo -- WDYT?

Harbormaster completed remote builds in B154911: Diff 416283.Mar 17 2022, 2:49 PM

In D121727#3390620, @tra wrote:

In D121727#3390353, @asavonic wrote:

Added a draft for %(when cond command) LIT syntax.

RUN: %(when ptxas ptxas -c %t-nvptx.ptx -o /dev/null)

That may work. I'd change the syntax a bit to separate the condition check from command itself.
E.g. something like RUN: %when(ptxas) ptxas -c %t-nvptx.ptx -o /dev/null

Perhaps we can just expose available features via environment variables and then let shell do the work.
E.g. set LIT_HAS_PTXAS=1 and then use RUN: ((LIT_HAS_PTXAS)) && ptxas command

The downside of this approach is that it assumes that we do run tests using shell, which may not always be the case (on Windows?).

Tests can either be executed by the lit internal shell, or by shelling out to Windows. Within the llvm/clang testsuites, the lit internal shell is used by default on Windows, but not e.g. in the libcxx testsuite: https://github.com/llvm/llvm-project/blob/main/llvm/utils/lit/lit/llvm/config.py#L25-L42

The lit internal shell can handle a fair bit of expressions, but I'm not sure if it handles this feature.

Another approach would be to use RUN-<feature>: to mark conditional lines. Or RUNIF:feature1,feature2:

I'm just thinking out loud. Some sort of %when predicate would work for me, but I'm not the owner of lit. Speaking of owners, we should add someone familiar to the review.
Summoning @mstorsjo -- WDYT?

Not much opinions on the suggested ways of implementing this. I think @jdenny is code owner (or the closest thing) for lit, I think he'd have more input on the direction here.

In D121727#3390706, @mstorsjo wrote:

I think @jdenny is code owner (or the closest thing) for lit,

Not sure I am. :-)

I think he'd have more input on the direction here.

Yes, I can offer an opinion. Thanks for tagging me.

I'm reluctant to add a new lit construct (%when or RUNIF:) based on only one use case (or are there more I missed?) when an existing construct (lit substitution) would work easily. A second or third use case might reveal more generality is needed. I'm also slightly concerned the proposed %when or RUNIF: might generally encourage people to use it when REQUIRES is the right thing to do (but I understand REQUIRES is not right for this use case).

If you go with a new construct, here are some additional capabilities that occur to me:

A block syntax would be nice so you don't have to repeat the condition across multiple RUN lines and don't have to string together commands with &&. Given that, a one-line syntax might not even be worthwhile.
Besides features, it might support lit substitutions within the condition. Some test suites (I'm thinking of libomptarget and some of my downstream work) instantiate the same lit test and the same lit RUN lines multiple times in different sub test suites for different configurations. If those configurations are reflected in lit substitutions, then the RUN lines could vary across instantiations to some degree based on the condition in a %when or RUNIF.
Ultimately, I imagine this would become a general preprocessor #if directive for lit. Maybe name it %if?
An additional operator syntax could sometimes be useful for changing parts of a command: RUN: FileCheck -DVAR=%if(cond, then, else).

But do we want to go down this road just yet? Maybe it needs to bake a bit longer? The lit substitution is simple and fits your current use case well, in my opinion.

If you go with the lit substitution, I have a few small comments. I like the proposal to discard its output. That makes it harder to misuse it, and it leads people back to REQUIRES in some cases. The substitution could be named to more clearly indicate it might not run: %optional-ptxas-verify is one possibility. When not available, it could expand to an explicit message instead of true to help people understand what's happening when debugging in lit's verbose mode. We already use the : command to report RUN line numbers, so that could be used here. The message might be "Skipping verification because ptxas is not available".

I've added a few other subscribers that have participated in lit discussions in the past.

In D121727#3392558, @jdenny wrote:

I'm reluctant to add a new lit construct (%when or RUNIF:) based on only one use case (or are there more I missed?)

It's a use case where we currently can not easily verify the assembly we produce for the whole back-end (NVPTX). When we add new instructions, we have to test the output manually. The end result is that we've already had a handful of bugs that were detected only after they showed up long after commit and that has non-trivial logistical consequences. E.g. if it makes it into a compiler release, it's hard to get all the users to update their compiler even if we do make a patch release with the fix.

Making things easily testable would provide an important safeguard.

when an existing construct (lit substitution) would work easily.

Could you elaborate? Do you mean the substitution of %ptxas with true if it's not available? Something else?

A second or third use case might reveal more generality is needed. I'm also slightly concerned the proposed %when or RUNIF: might generally encourage people to use it when REQUIRES is the right thing to do (but I understand REQUIRES is not right for this use case).

Those were just strawman-style proposals. I don't have particular attachment to any of them.
My main concern was that tests should not lie to the user. One should be able to tell a test that was not run from the test that ran and succeeded.

But do we want to go down this [ complicated ] road just yet? Maybe it needs to bake a bit longer? The lit substitution is simple and fits your current use case well, in my opinion.

SGTM, modulo "no-lies" concern I've outlined above.

If you go with the lit substitution, I have a few small comments. I like the proposal to discard its output.

Which output do you have in mind?

If we go with substitution, I think the sensible thing to do would be to discard the whole RUN line if the substitution fails. Unlike substituting %ptxas with true, it would not work with RUN lines that depend on ptxas output. E.g. %ptxas | FileCheck.

That makes it harder to misuse it, and it leads people back to REQUIRES in some cases. The substitution could be named to more clearly indicate it might not run: %optional-ptxas-verify is one possibility. When not available, it could expand to an explicit message instead of true to help people understand what's happening when debugging in lit's verbose mode.

What if we introduce a pseudo-tool %if-<feature>. If the feature is available it would be substituted by an empty string and would not affect the result of the substitution of the RUN line. If the substitution fails, it would disable the RUN line and we could issue a diagnostic when run in verbose mode. It would be easy to tell what's expected to happen in the test source and, as a bonus, would allow predicating RUN lines based on multiple features.

E.g. RUN: %if-ptxas %ptxas whatever.

This would also be useful when we need to do some back-end specific tests in a largely generic test file. E.g. when we're testing generic LLVM intrinsics we may want to verify that they produce particular instruction for a given back-end, but we don't know whether that back-end is compiled in. Right now we have to copy/paste such tests per-back-end and use REQUIRES.

We already use the : command to report RUN line numbers, so that could be used here. The message might be "Skipping verification because ptxas is not available".

I'm not familiar with that. Could you give me an example of how/where this is used?

What I've seen done in other cases is to write a second test file that uses the first test file as input. This lets the second test file have different requirements than the first. In the example here, you wouldn't modify access-non-generic.ll but rather add a second file, say access-non-generic-ptxas.ll that contained just this:

REQUIRES: ptxas
RUN: llc < access-non-generic.ll -march=nvptx -mcpu=sm_20 -o %t.ptx
RUN: ptxas -c %t.ptx -o /dev/null

The upside is, this uses existing lit mechanisms (feature test based on discovery of "ptxas" on PATH, or some environment variable is set).
The downside is, a bit more effort to keep the two files consistent; although in this case, it looks like the "ptxas" commands are pretty simple so I shouldn't think it would be a real burden.

In D121727#3392944, @tra wrote:

when an existing construct (lit substitution) would work easily.

Could you elaborate? Do you mean the substitution of %ptxas with true if it's not available? Something else?

Yes, I was referring to the %ptxas-verify proposal mentioned earlier.

A second or third use case might reveal more generality is needed. I'm also slightly concerned the proposed %when or RUNIF: might generally encourage people to use it when REQUIRES is the right thing to do (but I understand REQUIRES is not right for this use case).

Those were just strawman-style proposals. I don't have particular attachment to any of them.
My main concern was that tests should not lie to the user. One should be able to tell a test that was not run from the test that ran and succeeded.

I'm not seeing how %when or RUNIF: accomplishes that particular goal better than %ptxas-verify. Either way, the RUN line runs when ptxas is available and quietly doesn't run when it's not available. Am I misunderstanding?

If you go with the lit substitution, I have a few small comments. I like the proposal to discard its output.

Which output do you have in mind?

I was referring to the ptxas -c -o /dev/null proposal mentioned earlier, but now I realize that only helps with file output. More below.

If we go with substitution, I think the sensible thing to do would be to discard the whole RUN line if the substitution fails. Unlike substituting %ptxas with true, it would not work with RUN lines that depend on ptxas output. E.g. %ptxas | FileCheck.

So the problem is someone might check ptxas diagnostic output with FileCheck? That seems like a bad idea even when you're sure ptxas is available given that it's an external tool whose diagnostics can change over time. At most, it seems like tests would be written to fail for a non-zero exit status and possibly a non-empty stderr, and that should work fine when expanding to true.

What if we introduce a pseudo-tool %if-<feature>. If the feature is available it would be substituted by an empty string and would not affect the result of the substitution of the RUN line. If the substitution fails, it would disable the RUN line and we could issue a diagnostic when run in verbose mode. It would be easy to tell what's expected to happen in the test source and, as a bonus, would allow predicating RUN lines based on multiple features.

E.g. RUN: %if-ptxas %ptxas whatever.

In comparison to the %when, RUNIF:, %if, etc. proposals, is this just a new syntax proposal? Or is there another difference to consider?

This would also be useful when we need to do some back-end specific tests in a largely generic test file. E.g. when we're testing generic LLVM intrinsics we may want to verify that they produce particular instruction for a given back-end, but we don't know whether that back-end is compiled in. Right now we have to copy/paste such tests per-back-end and use REQUIRES.

It sounds like we're getting into more use cases now, so that makes a new construct more appealing.

We already use the : command to report RUN line numbers, so that could be used here. The message might be "Skipping verification because ptxas is not available".

I'm not familiar with that. Could you give me an example of how/where this is used?

Failed lit tests have output like the following so that, if there are many RUN lines, it's easier to tell which RUN line failed:

$ ":" "RUN: at line 1"

You might need LIT_OPTS=-vv to make it useful, depending on your config.

Another use case for some sort of "if X" construct (or RUNIF:X etc) is tests that have slightly different behaviours on Windows and Linux. One such example that already exists is in tools\llvm-objdump\X86\source-interleave-prefix-non-windows.test and its Windows equivalent tools\llvm-objdump\X86\source-interleave-prefix-windows.test. It would be great to be able to have the same test, but the behaviour to differ slightly for the specific checks. Using the RUNIF format as an example, it might look like:

RUN: ... commands to generate some output ... > output.txt
RUNIF:windows: FileCheck %s --input=output.txt --check-prefix=WINDOWS
RUNIF:!windows: FileCheck %s --input=output.txt --check-prefix=NON-WINDOWS

Of course, this above has a lot of duplicated code, so it might be nice to make the if an if/else ternary-style. Spitball idea:

RUN: ... | FileCheck %s --check-prefix={%if windows {WINDOWS} else {NON-WINDOWS}}

The specific case here could become:

RUN: {%if ptxas {ptxas some-args | FileCheck %s} else true}

Ideally, we'd even allow empty RUN lines, if a substitution on the line evaluates to an empty string, so that the else true isn't necessary. Bonus points would allow this if to span multiple lines. Maybe this:

RUN: {%if feature { \
RUN:   command1; \
RUN:   command2 | \
RUN:   command3}}

In D121727#3393335, @jhenderson wrote:
Of course, this above has a lot of duplicated code, so it might be nice to make the if an if/else ternary-style. Spitball idea:
RUN: ... | FileCheck %s --check-prefix={%if windows {WINDOWS} else {NON-WINDOWS}}

I like this direction as it seems more flexible. Why do we need the outermost braces?

Bonus points would allow this if to span multiple lines. Maybe this:
RUN: {%if feature { \
RUN:   command1; \
RUN:   command2 | \
RUN:   command3}}

Might be nice if the \ is not required so it's possible to enclose multiple existing RUN lines without having to join them with ;, &&, etc.

In D121727#3393386, @jdenny wrote:
In D121727#3393335, @jhenderson wrote:
Of course, this above has a lot of duplicated code, so it might be nice to make the if an if/else ternary-style. Spitball idea:
RUN: ... | FileCheck %s --check-prefix={%if windows {WINDOWS} else {NON-WINDOWS}}
I like this direction as it seems more flexible. Why do we need the outermost braces?

Because the else might be optional, so it clearly delineates the if/else block, as opposed to something using the term "else" immediately after the if part. I'm open to other suggestions on this though.

Bonus points would allow this if to span multiple lines. Maybe this:
RUN: {%if feature { \
RUN:   command1; \
RUN:   command2 | \
RUN:   command3}}
Might be nice if the \ is not required so it's possible to enclose multiple existing RUN lines without having to join them with ;, &&, etc.

Agreed, though I don't have a clear view of how this might work, since the RUN lines are done pre-substitution (I think, not actually checked), and I see this "if" style as a kind of conditionalised substitution.

In D121727#3393179, @jdenny wrote:

In D121727#3392944, @tra wrote:

I'm not seeing how %when or RUNIF: accomplishes that particular goal better than %ptxas-verify. Either way, the RUN line runs when ptxas is available and quietly doesn't run when it's not available. Am I misunderstanding?

The difference is that replacing the command itself with true still executes the RUN line.

I believe that the whole "RUN" line must be handled atomically. Either all commands are executed as specified by user, or none of them, if we can't provide any of the substitutions.
We should not magically replace a command with true, execute the RUN line and expect it not to fail for all possible commands users may put on the RUN line.
lit can control whether it runs the RUN line or not, but it has no control what the user expected that command to do.
Sure, we can provide a restricted variant of the tool substitution which would explicitly require RUN line not to depend on command's outputs, only on its exit status.
That would be unnecessarily restrictive, IMO, but we can live with that. Still, I think executing or not whole RUN line will give us more flexibility and consistency.

If we go with substitution, I think the sensible thing to do would be to discard the whole RUN line if the substitution fails. Unlike substituting %ptxas with true, it would not work with RUN lines that depend on ptxas output. E.g. %ptxas | FileCheck.

So the problem is someone might check ptxas diagnostic output with FileCheck?

It's just one possibility. In general, it's applicable to any output the program may produce, not just what it may print on the console. It's also not specific to ptxas either. In general it may be applicable to any optionally available external tool. ptxas just happened to be the first one where the benefit was worth the trouble implementing this patch.
Even in case of ptxas, I can see how we may conceivably want to examine the ELF binary it produces. For instance, we may want to verify that it's got correct ELF sections in it, or examine DWARF data, etc. Maybe disassemble the code (we'd need another optional tool for that).

That seems like a bad idea even when you're sure ptxas is available given that it's an external tool whose diagnostics can change over time.
At most, it seems like tests would be written to fail for a non-zero exit status and possibly a non-empty stderr, and that should work fine when expanding to true.

True, to an extent, but even potentially unstable tool output is better than nothing. I can think of using the output of ptxas -v to catch at least one kind of regression we've run into in the past.

I think we should figure out how such optional tools are supposed to work in lit and leave it up to users what they want to do with it. We don't know whether relying on external tool output is a good idea or not for all use cases.

What if we introduce a pseudo-tool %if-<feature>. If the feature is available it would be substituted by an empty string and would not affect the result of the substitution of the RUN line. If the substitution fails, it would disable the RUN line and we could issue a diagnostic when run in verbose mode. It would be easy to tell what's expected to happen in the test source and, as a bonus, would allow predicating RUN lines based on multiple features.

E.g. RUN: %if-ptxas %ptxas whatever.

In comparison to the %when, RUNIF:, %if, etc. proposals, is this just a new syntax proposal? Or is there another difference to consider?

Mostly syntax. It's yet another way to express "don't run if the feature is not available", as opposed to "substitute command with true and run".

Failed lit tests have output like the following so that, if there are many RUN lines, it's easier to tell which RUN line failed:
$ ":" "RUN: at line 1"
You might need LIT_OPTS=-vv to make it useful, depending on your config.

Thank you. This is good to know.

In D121727#3393453, @jhenderson wrote:
In D121727#3393386, @jdenny wrote:
In D121727#3393335, @jhenderson wrote:
Of course, this above has a lot of duplicated code, so it might be nice to make the if an if/else ternary-style. Spitball idea:
RUN: ... | FileCheck %s --check-prefix={%if windows {WINDOWS} else {NON-WINDOWS}}
I like this direction as it seems more flexible. Why do we need the outermost braces?
Because the else might be optional, so it clearly delineates the if/else block, as opposed to something using the term "else" immediately after the if part. I'm open to other suggestions on this though.

I think else -> %else should fix the parsing issue.

Bonus points would allow this if to span multiple lines. Maybe this:
RUN: {%if feature { \
RUN:   command1; \
RUN:   command2 | \
RUN:   command3}}
Might be nice if the \ is not required so it's possible to enclose multiple existing RUN lines without having to join them with ;, &&, etc.
Agreed, though I don't have a clear view of how this might work, since the RUN lines are done pre-substitution (I think, not actually checked), and I see this "if" style as a kind of conditionalised substitution.

In my downstream work, I managed to implement a lit %for data {...} syntax that unrolls its body's RUN lines before handing things to the shell. It does not require \. Based on that, we can probably get %if without \ to work somehow.

In D121727#3392973, @probinson wrote:
What I've seen done in other cases is to write a second test file that uses the first test file as input. This lets the second test file have different requirements than the first. In the example here, you wouldn't modify access-non-generic.ll but rather add a second file, say access-non-generic-ptxas.ll that contained just this:
REQUIRES: ptxas
RUN: llc < access-non-generic.ll -march=nvptx -mcpu=sm_20 -o %t.ptx
RUN: ptxas -c %t.ptx -o /dev/null
The upside is, this uses existing lit mechanisms (feature test based on discovery of "ptxas" on PATH, or some environment variable is set).
The downside is, a bit more effort to keep the two files consistent; although in this case, it looks like the "ptxas" commands are pretty simple so I shouldn't think it would be a real burden.

As you've mentioned, we'll need to do it for all the 200 or so tests we potentially want to use ptxas on.

In D121727#3393481, @tra wrote:

I believe that the whole "RUN" line must be handled atomically.

At this point, multiple use cases beyond the specific ptxas verification use case that started this thread are arising, so it sounds like a new construct that's general enough to handle them all is warranted. For that general case, I agree with your arguments along the above lines. Thanks for explaining your position.

I can think of using the output of ptxas -v to catch at least one kind of regression we've run into in the past.

Thanks. I wasn't aware of that use case.

ychen added a subscriber: ychen.Mar 23 2022, 12:36 PM

asavonic mentioned this in D121299: [NVPTX] Disable DWARF .file directory for PTX.Mar 24 2022, 11:50 AM

asavonic mentioned this in D122569: [lit] Support %if ... %else syntax for RUN lines.Mar 28 2022, 4:08 AM

Published the LIT patch to: https://reviews.llvm.org/D122569 [lit] Support %if ... %else syntax for RUN lines

While D122569 is waiting for feedback or approval, I've submitted these patches to fix minor issues that were found during ptxas integration:

D123038 [NVPTX] 64-bit atom.{and,or,xor,min,max} require sm_32 or higher
D123039 [NVPTX] shfl.sync is introduced in PTX 6.0
D123040 [NVPTX] .attribute(.managed) is only supported for sm_30 and PTX 4.0
D123041 [NVPTX] Avoid dots in global names

In D121727#3426524, @asavonic wrote:

While D122569 is waiting for feedback or approval, I've submitted these patches to fix minor issues that were found during ptxas integration:

D123038 [NVPTX] 64-bit atom.{and,or,xor,min,max} require sm_32 or higher

D123039 [NVPTX] shfl.sync is introduced in PTX 6.0

D123040 [NVPTX] .attribute(.managed) is only supported for sm_30 and PTX 4.0

D123041 [NVPTX] Avoid dots in global names

Thank you for the clean-ups.
You may want to arrange the patches into a stack of related revisions (see phabricator menu on the right side of patch description).

asavonic added child revisions: D123038: [NVPTX] 64-bit atom.{and,or,xor,min,max} require sm_32 or higher, D123039: [NVPTX] shfl.sync is introduced in PTX 6.0, D123040: [NVPTX] .attribute(.managed) is only supported for sm_30 and PTX 4.0, D123041: [NVPTX] Avoid dots in global names, D123470: [NVPTX] Fix barrier.ll LIT test.Apr 11 2022, 8:52 AM

asavonic added a child revision: D122569: [lit] Support %if ... %else syntax for RUN lines.Apr 11 2022, 8:56 AM

asavonic added a child revision: D124108: [NVPTX] Fix LIT tests with default nameTableKind.Apr 20 2022, 10:30 AM

Herald added subscribers: mattd, gchakrabarti. · View Herald TranscriptApr 20 2022, 10:30 AM

Added LLVM_PTXAS_EXECUTABLE environment variable to simplify testing against different ptxas versions.
Rebased on top of D122569.
Integrated ptxas to all tests in CodeGen/NVPTX and DebugInfo/NVPTX.

Herald added a subscriber: zzheng. · View Herald TranscriptApr 20 2022, 12:18 PM

Harbormaster completed remote builds in B160505: Diff 423984.Apr 20 2022, 12:18 PM

tra added inline comments.Apr 20 2022, 12:24 PM

llvm/test/CodeGen/NVPTX/bug41651.ll
2	Here and everywhere -- I'd run conditional ptxas testing as the last step. We should make sure that the actual tests have succeeded, first. No point running ptxas on the output that we already know is wrong.

I've tested the patch against every ptxas release from 9.0 to the latest 11.6. Version 9.0 is the earliest version that I was able to install on my Debian oldstable machine, so it seems like a good minimal version that we can support. Also, it is worth noting that ptxas is not completely backwards compatible: newer versions can drop old SM versions, so we sometimes need to account for this. For example:

; sm_30 is dropped by ptxas 11.0
; RUN: %if ptxas %{ %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %t.ptx %}

One disadvantage of this patch is that we can no longer pipe llc output to FileCheck: we have to use -o option with a temporary file. This may break update_llc_checks.py, but I haven't confirmed this yet.
We can use tee to avoid this, but it is not portable. Perhaps we can write a simple version of tee and add it to LLVM tools.

In D121727#3462870, @asavonic wrote:

One disadvantage of this patch is that we can no longer pipe llc output to FileCheck: we have to use -o option with a temporary file. This may break update_llc_checks.py, but I haven't confirmed this yet.

We don't have to use a temporary file, I think. ptxas can get its own input from its own llc invocation
echo -e '.version 3.2\n.target sm_20\n.address_size 64' | $HOME/local/cuda-11.2/bin/ptxas - works for me.

In D121727#3462903, @tra wrote:

In D121727#3462870, @asavonic wrote:

One disadvantage of this patch is that we can no longer pipe llc output to FileCheck: we have to use -o option with a temporary file. This may break update_llc_checks.py, but I haven't confirmed this yet.

We don't have to use a temporary file, I think. ptxas can get its own input from its own llc invocation
echo -e '.version 3.2\n.target sm_20\n.address_size 64' | $HOME/local/cuda-11.2/bin/ptxas - works for me.

Right, but then we can't pipe it to file check. Something like this should work, but it is not portable:

llc ... | tee %t.ptx | FileCheck
ptxas %t.ptx

In D121727#3462915, @asavonic wrote:
Right, but then we can't pipe it to file check. Something like this should work, but it is not portable:
llc ... | tee %t.ptx | FileCheck
ptxas %t.ptx

What stops us doing it the same way we do for all the test cases? E.g.

RUN: llc ... %s |  FileCheck
RUN: %if ptxas %{ llc... %s | ptxas - %}

In D121727#3462941, @tra wrote:
In D121727#3462915, @asavonic wrote:
Right, but then we can't pipe it to file check. Something like this should work, but it is not portable:
llc ... | tee %t.ptx | FileCheck
ptxas %t.ptx
What stops us doing it the same way we do for all the test cases? E.g.
RUN: llc ... %s |  FileCheck
RUN: %if ptxas %{ llc... %s | ptxas - %}

Well, this will probably double execution time for all tests. Options for llc have to be duplicated as well and kept in sync.

In D121727#3462983, @asavonic wrote:

Well, this will probably double execution time for all tests. Options for llc have to be duplicated as well and kept in sync.

It's not any different compared to using llc or any other tool on any other RUN lines. In any case, for most of the users it will be a no-op as ptxas will be disabled for almost everyone.

Reimplemented ptxas RUN lines to use pipes instead of temporary files

Harbormaster completed remote builds in B161166: Diff 424908.Apr 25 2022, 7:57 AM

tra accepted this revision.Apr 25 2022, 11:07 AM

This revision is now accepted and ready to land.Apr 25 2022, 11:07 AM

asavonic mentioned this in rG1041a9642ba0: [lit] Support %if ... %else syntax for RUN lines.Apr 27 2022, 10:30 AM

This revision was landed with ongoing or failed builds.Apr 28 2022, 5:02 AM

Closed by commit rG0f1b5f115a7f: [NVPTX] Integrate ptxas to LIT tests (authored by asavonic). · Explain Why

This revision was automatically updated to reflect the committed changes.

asavonic added a commit: rG0f1b5f115a7f: [NVPTX] Integrate ptxas to LIT tests.

It looks very weird to me to have LLVM's test suite support calling some proprietary tool. I can see how this is useful, but imho it doesn't belong in LLVM itself.

This is both for practical and other reasons.

From a practical point of view, if we rely on this tool for testing in LLVM, it means we can't run LLVM's tests on platforms that that tool doesn't run on.

Independent of practical considerations, LLVM tries to be an open-source, standalone toolchain development suite. Making it depend on proprietary programs, even just optionally and at the test level, works against both these goals.

How do others feel about this?

In D121727#3479832, @thakis wrote:

It looks very weird to me to have LLVM's test suite support calling some proprietary tool. I can see how this is useful, but imho it doesn't belong in LLVM itself.

This is both for practical and other reasons.

From a practical point of view, if we rely on this tool for testing in LLVM, it means we can't run LLVM's tests on platforms that that tool doesn't run on.

It is weird and inconvenient to have a proprietary tool that is used for testing, I agree. This is why this feature is completely optional, and mainly intended to be used by buildbots or by people who contribute to NVPTX regularly.
If I understand correctly, PTX is only defined by the ISA manual, which is sometimes incomplete or unclear, so ptxas is used as a reference implementation. In fact, requests to test code generated by llc with ptxas (such as [1]) were frequent enough (and helpful as well) for me to justify making this patch set. Before this integration I had to dump llc output, run ptxas with correct options for each test, and do this for every test that I could break.

[1]: https://reviews.llvm.org/D114367#3147361

Independent of practical considerations, LLVM tries to be an open-source, standalone toolchain development suite. Making it depend on proprietary programs, even just optionally and at the test level, works against both these goals.

I personally wouldn't call it a dependency. If there is a free software tool that can compile PTX assembly and verify it (I'm not aware of any) - it should be fairly easy to support it as well.
In any case, ptxas is just a tool used for development. It is the same story as with static analyzers: yes, many of them are proprietary, but if some people are willing to run them - they can help to make free software better.

I agree, with caveats, that in general LLVM should not depend on proprietary tools.
I'm also potentially biased as I'm one of the primary beneficiaries of these changes.

It's unfortunate that we both do want to verify that LLVM produces valid output and are unable to do so
without ptxas, as it is the ultimate consumer of the LLVM output and is the source of the ground truth when it comes to the validity of PTX syntax.

Without this patch, the only choice we have is to land NVPTX back-end changes with limited testing for what *we* think is a valid PTX.
ptxas does not always agree. In order to verify correctness of the PTX we do need to feed the generated PTX to ptxas.
Right now it has to be done manually. I frequently ask contributors to do that during code reviews and it did allow us to spot some errors early.
We've also discovered a handful of old errors by running the tests introduced by this patch.

We do have bots running CUDA test-suite, but they are mostly concerned with front-end being functional for
various combinations of GPU, CUDA version, and the standard C++ library used.
These tests are not particularly well-suited for verifying syntax of wide range of instructions we generate.

We could've set up yet another CPU-only buildbot for running ptxas compilation tests.
I actually do plan to do that, and having this patch allows doing that relatively easily.
That would still be removed from the LLVM commits that introduce the changes in PTX output.
Developers would still have to do their own tests manually and we would still need some sort of solution for that.

This feels like (maybe is?) supporting a target whose assembler happens not to be generally distributed, but which LLVM still supports as a target. For those environments/bots that actually have the assembler installed, having that extra level of validation seems worthwhile.

But perhaps this discussion should be taking place on discourse, not in a review?

In D121727#3481635, @probinson wrote:

This feels like (maybe is?) supporting a target whose assembler happens not to be generally distributed, but which LLVM still supports as a target.
For those environments/bots that actually have the assembler installed, having that extra level of validation seems worthwhile.

That is exactly the case.

But perhaps this discussion should be taking place on discourse, not in a review?

Sure. If @thakis has further concerns or questions, I'll be happy to continue the discussion there.

I think having a short thread on discourse might be a good idea.

If we had used the strategy here in the clang-cl project, I feel we would've ended up in a much less useful place. Now we have (cross-platform!) PDB writers and dumpers and whatnot in llvm. If we had (optionally) relied on proprietary tools like this here does, that wouldn't have happened.

In D121727#3491089, @thakis wrote:

I think having a short thread on discourse might be a good idea.

https://discourse.llvm.org/t/opinions-on-conditional-dependencies-on-proprietary-tools/62236

Revision Contents

Path

Size

llvm/

test/

CodeGen/

NVPTX/

LoadStoreVectorizer.ll

3 lines

MachineSink-call.ll

2 lines

MachineSink-convergent.ll

2 lines

TailDuplication-convergent.ll

1 line

access-non-generic.ll

2 lines

add-128bit.ll

1 line

addrspacecast-gvar.ll

1 line

3 lines

1 line

1 line

2 lines

1 line

arithmetic-fp-sm20.ll

2 lines

arithmetic-int.ll

2 lines

async-copy.ll

4 lines

atomics-sm60.ll

2 lines

atomics-with-scope.ll

2 lines

1 line

1 line

1 line

1 line

2 lines

1 line

1 line

1 line

1 line

1 line

1 line

2 lines

1 line

call-with-alloca-buffer.ll

1 line

1 line

2 lines

2 lines

1 line

2 lines

1 line

2 lines

2 lines

1 line

1 line

1 line

1 line

1 line

1 line

2 lines

1 line

1 line

1 line

21 lines

f16x2-instructions.ll

16 lines

1 line

2 lines

4 lines

1 line

2 lines

1 line

2 lines

1 line

1 line

1 line

1 line

2 lines

2 lines

global-variable-big.ll

2 lines

1 line

1 line

1 line

1 line

1 line

1 line

1 line

1 line

1 line

1 line

1 line

1 line

2 lines

1 line

2 lines

inlineasm-output-template.ll

1 line

2 lines

2 lines

1 line

3 lines

2 lines

2 lines

1 line

1 line

1 line

1 line

ldu-reg-plus-offset.ll

1 line

libcall-fulfilled.ll

2 lines

load-sext-i1.ll

1 line

load-store.ll

1 line

load-with-non-coherent-cache.ll

2 lines

2 lines

1 line

1 line

1 line

2 lines

lower-kernel-ptr-arg.ll

1 line

machine-sink.ll

1 line

managed.ll

1 line

match.ll

1 line

math-intrins-sm53-ptx42.ll

1 line

math-intrins-sm80-ptx70.ll

1 line

math-intrins-sm86-ptx72.ll

1 line

math-intrins.ll

4 lines

mbarrier.ll

2 lines

minmax-negative.ll

1 line

misaligned-vector-ldst.ll

1 line

1 line

2 lines

2 lines

1 line

2 lines

1 line

1 line

nvvm-annotations-D120129.ll

1 line

param-align.ll

1 line

param-load-store.ll

1 line

param-vectorize-device.ll

1 line

param-vectorize-kernel.ll

1 line

1 line

2 lines

2 lines

1 line

read-global-variable-constant.ll

1 line

1 line

1 line

1 line

2 lines

2 lines

2 lines

1 line

1 line

1 line

1 line

1 line

1 line

1 line

1 line

1 line

4 lines

4 lines

4 lines

3 lines

1 line

2 lines

1 line

2 lines

2 lines

1 line

2 lines

2 lines

1 line

2 lines

2 lines

2 lines

1 line

1 line

2 lines

1 line

4 lines

1 line

1 line

4 lines

1 line

vectorize-misaligned.ll

2 lines

1 line

1 line

1 line

28 lines

1 line

DebugInfo/

NVPTX/

crash-missing-DISubprogram.ll

3 lines

cu-range-hole.ll

1 line

dbg-declare-alloca.ll

1 line

dbg-value-const-byref.ll

2 lines

debug-addr-class.ll

1 line

debug-empty.ll

1 line

debug-file-loc-only.ll

1 line

1 line

1 line

1 line

1 line

3 lines

1 line

47 lines

1 line

Diff 425748

llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll

	; RUN: llc < %s \| FileCheck -check-prefix=ENABLED %s			; RUN: llc < %s \| FileCheck -check-prefix=ENABLED %s
	; RUN: llc -disable-nvptx-load-store-vectorizer < %s \| FileCheck -check-prefix=DISABLED %s			; RUN: llc -disable-nvptx-load-store-vectorizer < %s \| FileCheck -check-prefix=DISABLED %s
				; RUN: %if ptxas %{ llc < %s \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc -disable-nvptx-load-store-vectorizer < %s \| %ptxas-verify %}

	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	; Check that the load-store vectorizer is enabled by default for nvptx, and			; Check that the load-store vectorizer is enabled by default for nvptx, and
	; that it's disabled by the appropriate flag.			; that it's disabled by the appropriate flag.

	; ENABLED: ld.v2.{{.}}32			; ENABLED: ld.v2.{{.}}32
	; DISABLED: ld.{{.}}32			; DISABLED: ld.{{.}}32
	; DISABLED: ld.{{.}}32			; DISABLED: ld.{{.}}32
	▲ Show 20 Lines • Show All 41 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/MachineSink-call.ll

	; RUN: llc < %s \| FileCheck %s			; RUN: llc < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s \| %ptxas-verify %}

	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	declare void @foo()			declare void @foo()

	; Load a value, then call a function. Branch, and use the loaded value only on			; Load a value, then call a function. Branch, and use the loaded value only on
	; one side of the branch. The load shouldn't be sunk beneath the call, because			; one side of the branch. The load shouldn't be sunk beneath the call, because
	; the call may modify memory.			; the call may modify memory.
	define i32 @f(i32 %x, i32* %ptr, i1 %cond) {			define i32 @f(i32 %x, i32* %ptr, i1 %cond) {
	Show All 14 Lines

llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll

	; RUN: llc < %s \| FileCheck %s			; RUN: llc < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s \| %ptxas-verify %}

	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	declare void @llvm.nvvm.barrier0()			declare void @llvm.nvvm.barrier0()

	; Load a value, then syncthreads. Branch, and use the loaded value only on one			; Load a value, then syncthreads. Branch, and use the loaded value only on one
	; side of the branch. The load shouldn't be sunk beneath the call, because			; side of the branch. The load shouldn't be sunk beneath the call, because
	; syncthreads is modeled as maystore.			; syncthreads is modeled as maystore.
	define i32 @f(i32 %x, i32* %ptr, i1 %cond) {			define i32 @f(i32 %x, i32* %ptr, i1 %cond) {
	Show All 14 Lines

llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll

	; RUN: llc -O2 -tail-dup-size=100 -enable-tail-merge=0 < %s \| FileCheck %s			; RUN: llc -O2 -tail-dup-size=100 -enable-tail-merge=0 < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc -O2 -tail-dup-size=100 -enable-tail-merge=0 < %s \| %ptxas-verify %}
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	declare void @foo()			declare void @foo()
	declare void @llvm.nvvm.barrier0()			declare void @llvm.nvvm.barrier0()

	; syncthreads shouldn't be duplicated.			; syncthreads shouldn't be duplicated.
	; CHECK: .func call_syncthreads			; CHECK: .func call_syncthreads
	; CHECK: bar.sync			; CHECK: bar.sync
	Show All 36 Lines

llvm/test/CodeGen/NVPTX/access-non-generic.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix PTX			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix PTX
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix PTX			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix PTX
	; RUN: opt -mtriple=nvptx-- < %s -S -infer-address-spaces \| FileCheck %s --check-prefix IR			; RUN: opt -mtriple=nvptx-- < %s -S -infer-address-spaces \| FileCheck %s --check-prefix IR
	; RUN: opt -mtriple=nvptx64-- < %s -S -infer-address-spaces \| FileCheck %s --check-prefix IR			; RUN: opt -mtriple=nvptx64-- < %s -S -infer-address-spaces \| FileCheck %s --check-prefix IR
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	@array = internal addrspace(3) global [10 x float] zeroinitializer, align 4			@array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
	@scalar = internal addrspace(3) global float 0.000000e+00, align 4			@scalar = internal addrspace(3) global float 0.000000e+00, align 4

	; Verifies nvptx-favor-non-generic correctly optimizes generic address space			; Verifies nvptx-favor-non-generic correctly optimizes generic address space
	; usage to non-generic address space usage for the patterns we claim to handle:			; usage to non-generic address space usage for the patterns we claim to handle:
	; 1. load cast			; 1. load cast
	; 2. store cast			; 2. store cast
	▲ Show 20 Lines • Show All 168 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/add-128bit.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"



	define void @foo(i64 %a, i64 %add, i128* %retptr) {			define void @foo(i64 %a, i64 %add, i128* %retptr) {
	; CHECK: add.s64			; CHECK: add.s64
	; CHECK: setp.lt.u64			; CHECK: setp.lt.u64
	Show All 10 Lines

llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK: .visible .global .align 4 .u32 g = 42;			; CHECK: .visible .global .align 4 .u32 g = 42;
	; CHECK: .visible .global .align 4 .u32 g2 = generic(g);			; CHECK: .visible .global .align 4 .u32 g2 = generic(g);
	; CHECK: .visible .global .align 4 .u32 g3 = g;			; CHECK: .visible .global .align 4 .u32 g3 = g;
	; CHECK: .visible .global .align 8 .u32 g4[2] = {0, generic(g)};			; CHECK: .visible .global .align 8 .u32 g4[2] = {0, generic(g)};
	; CHECK: .visible .global .align 8 .u32 g5[2] = {0, generic(g)+8};			; CHECK: .visible .global .align 8 .u32 g5[2] = {0, generic(g)+8};

	@g = addrspace(1) global i32 42			@g = addrspace(1) global i32 42
	@g2 = addrspace(1) global i32* addrspacecast (i32 addrspace(1)* @g to i32*)			@g2 = addrspace(1) global i32* addrspacecast (i32 addrspace(1)* @g to i32*)
	@g3 = addrspace(1) global i32 addrspace(1)* @g			@g3 = addrspace(1) global i32 addrspace(1)* @g
	@g4 = constant {i32, i32} {i32* null, i32* addrspacecast (i32 addrspace(1)* @g to i32*)}			@g4 = constant {i32, i32} {i32* null, i32* addrspacecast (i32 addrspace(1)* @g to i32*)}
	@g5 = constant {i32, i32} {i32* null, i32* addrspacecast (i32 addrspace(1)* getelementptr (i32, i32 addrspace(1)* @g, i32 2) to i32*)}			@g5 = constant {i32, i32} {i32* null, i32* addrspacecast (i32 addrspace(1)* getelementptr (i32, i32 addrspace(1)* @g, i32 2) to i32*)}

llvm/test/CodeGen/NVPTX/addrspacecast.ll

	; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s -check-prefixes=ALL,CLS32,G32			; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s -check-prefixes=ALL,CLS32,G32
	; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64,G64			; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64,G64
	; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr\| FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64,G64			; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr\| FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64,G64
				; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr \| %ptxas-verify %}

	; ALL-LABEL: conv1			; ALL-LABEL: conv1
	define i32 @conv1(i32 addrspace(1)* %ptr) {			define i32 @conv1(i32 addrspace(1)* %ptr) {
	; G32: cvta.global.u32			; G32: cvta.global.u32
	; ALL-NOT: cvt.u64.u32			; ALL-NOT: cvt.u64.u32
	; G64: cvta.global.u64			; G64: cvta.global.u64
	; ALL: ld.u32			; ALL: ld.u32
	%genptr = addrspacecast i32 addrspace(1)* %ptr to i32*			%genptr = addrspacecast i32 addrspace(1)* %ptr to i32*
	▲ Show 20 Lines • Show All 86 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/aggr-param.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; Make sure aggregate param types get emitted properly.			; Make sure aggregate param types get emitted properly.

	%struct.float4 = type { float, float, float, float }			%struct.float4 = type { float, float, float, float }

	; CHECK: .visible .func bar			; CHECK: .visible .func bar
	; CHECK: .param .align 4 .b8 bar_param_0[16]			; CHECK: .param .align 4 .b8 bar_param_0[16]
	define void @bar(%struct.float4 %f) {			define void @bar(%struct.float4 %f) {
	Show All 11 Lines

llvm/test/CodeGen/NVPTX/aggregate-return.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	declare <2 x float> @barv(<2 x float> %input)			declare <2 x float> @barv(<2 x float> %input)
	declare <3 x float> @barv3(<3 x float> %input)			declare <3 x float> @barv3(<3 x float> %input)
	declare [2 x float] @bara([2 x float] %input)			declare [2 x float] @bara([2 x float] %input)
	declare {float, float} @bars({float, float} %input)			declare {float, float} @bars({float, float} %input)

	define void @test_v2f32(<2 x float> %input, <2 x float>* %output) {			define void @test_v2f32(<2 x float> %input, <2 x float>* %output) {
	; CHECK-LABEL: @test_v2f32			; CHECK-LABEL: @test_v2f32
	▲ Show 20 Lines • Show All 53 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/annotations.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	@texture = internal addrspace(1) global i64 0, align 8			@texture = internal addrspace(1) global i64 0, align 8
	; CHECK: .global .texref texture			; CHECK: .global .texref texture
	@surface = internal addrspace(1) global i64 0, align 8			@surface = internal addrspace(1) global i64 0, align 8
	; CHECK: .global .surfref surface			; CHECK: .global .surfref surface

	; CHECK: .entry kernel_func_maxntid			; CHECK: .entry kernel_func_maxntid
	define void @kernel_func_maxntid(float* %a) {			define void @kernel_func_maxntid(float* %a) {
	▲ Show 20 Lines • Show All 42 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/arg-lowering.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK: .visible .func (.param .align 16 .b8 func_retval0[16]) foo0(			; CHECK: .visible .func (.param .align 16 .b8 func_retval0[16]) foo0(
	; CHECK: .param .align 4 .b8 foo0_param_0[8]			; CHECK: .param .align 4 .b8 foo0_param_0[8]
	define <4 x float> @foo0({float, float} %arg0) {			define <4 x float> @foo0({float, float} %arg0) {
	ret <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>			ret <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>
	}			}

	; CHECK: .visible .func (.param .align 8 .b8 func_retval0[8]) foo1(			; CHECK: .visible .func (.param .align 8 .b8 func_retval0[8]) foo1(
	; CHECK: .param .align 8 .b8 foo1_param_0[16]			; CHECK: .param .align 8 .b8 foo1_param_0[16]
	define <2 x float> @foo1({float, float, i64} %arg0) {			define <2 x float> @foo1({float, float, i64} %arg0) {
	ret <2 x float> <float 1.0, float 1.0>			ret <2 x float> <float 1.0, float 1.0>
	}			}

llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast \| %ptxas-verify %}

	;; These tests should run for all targets			;; These tests should run for all targets

	;;===-- Basic instruction selection tests ---------------------------------===;;			;;===-- Basic instruction selection tests ---------------------------------===;;


	;;; f64			;;; f64

	▲ Show 20 Lines • Show All 62 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/arithmetic-int.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	;; These tests should run for all targets			;; These tests should run for all targets

	;;===-- Basic instruction selection tests ---------------------------------===;;			;;===-- Basic instruction selection tests ---------------------------------===;;


	;;; i64			;;; i64

	▲ Show 20 Lines • Show All 307 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/async-copy.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 \| FileCheck -check-prefixes=ALL,CHECK_PTX32 %s			; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 \| FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| FileCheck -check-prefixes=ALL,CHECK_PTX64 %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
				; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 \| %ptxas-verify -arch=sm_80 %}
				; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| %ptxas-verify -arch=sm_80 %}

	declare void @llvm.nvvm.cp.async.wait.group(i32)			declare void @llvm.nvvm.cp.async.wait.group(i32)

	; ALL-LABEL: asyncwaitgroup			; ALL-LABEL: asyncwaitgroup
	define void @asyncwaitgroup() {			define void @asyncwaitgroup() {
	; ALL: cp.async.wait_group 8;			; ALL: cp.async.wait_group 8;
	tail call void @llvm.nvvm.cp.async.wait.group(i32 8)			tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
	; ALL: cp.async.wait_group 0;			; ALL: cp.async.wait_group 0;
	▲ Show 20 Lines • Show All 100 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/atomics-sm60.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_60 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_60 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_60 \| %ptxas-verify -arch=sm_60 %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 \| %ptxas-verify -arch=sm_60 %}

	; CHECK-LABEL: .func test(			; CHECK-LABEL: .func test(
	define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) {			define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) {
	; CHECK: atom.add.f64			; CHECK: atom.add.f64
	%r1 = call double @llvm.nvvm.atomic.load.add.f64.p0f64(double* %dp0, double %d)			%r1 = call double @llvm.nvvm.atomic.load.add.f64.p0f64(double* %dp0, double %d)
	; CHECK: atom.global.add.f64			; CHECK: atom.global.add.f64
	%r2 = call double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* %dp1, double %d)			%r2 = call double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* %dp1, double %d)
	; CHECK: atom.shared.add.f64			; CHECK: atom.shared.add.f64
	Show All 20 Lines

llvm/test/CodeGen/NVPTX/atomics-with-scope.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_60 \| FileCheck %s -check-prefixes=CHECK,CHECK32			; RUN: llc < %s -march=nvptx -mcpu=sm_60 \| FileCheck %s -check-prefixes=CHECK,CHECK32
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_60 \| %ptxas-verify -arch=sm_60 %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 \| %ptxas-verify -arch=sm_60 %}

	; CHECK-LABEL: .func test_atomics_scope(			; CHECK-LABEL: .func test_atomics_scope(
	define void @test_atomics_scope(float* %fp, float %f,			define void @test_atomics_scope(float* %fp, float %f,
	double* %dfp, double %df,			double* %dfp, double %df,
	i32* %ip, i32 %i,			i32* %ip, i32 %i,
	i32* %uip, i32 %ui,			i32* %uip, i32 %ui,
	i64* %llp, i64 %ll) #0 {			i64* %llp, i64 %ll) #0 {
	entry:			entry:
	▲ Show 20 Lines • Show All 177 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/atomics.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_32 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_32 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_32 \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_32%} %}


	; CHECK-LABEL: atom0			; CHECK-LABEL: atom0
	define i32 @atom0(i32* %addr, i32 %val) {			define i32 @atom0(i32* %addr, i32 %val) {
	; CHECK: atom.add.u32			; CHECK: atom.add.u32
	%ret = atomicrmw add i32* %addr, i32 %val seq_cst			%ret = atomicrmw add i32* %addr, i32 %val seq_cst
	ret i32 %ret			ret i32 %ret
	}			}
	▲ Show 20 Lines • Show All 194 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/b52037.ll

	; Reproducer for a bad performance regression triggered by switch to the new PM.			; Reproducer for a bad performance regression triggered by switch to the new PM.
	; `barney` ended up with the local variables not being optimized away and that			; `barney` ended up with the local variables not being optimized away and that
	; had rather dramatic effect on some GPU code. See			; had rather dramatic effect on some GPU code. See
	; https://bugs.llvm.org/show_bug.cgi?id=52037 for the gory details.			; https://bugs.llvm.org/show_bug.cgi?id=52037 for the gory details.
	;			;
	; RUN: llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s \| FileCheck %s			; RUN: llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s \| FileCheck %s
				; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s \| %ptxas-verify -arch=sm_70 %}

	; CHECK-LABEL: .visible .entry barney(			; CHECK-LABEL: .visible .entry barney(
	; CHECK-NOT: .local{{.*}}__local_depot			; CHECK-NOT: .local{{.*}}__local_depot
	; CHECK: ret;			; CHECK: ret;

	source_filename = "reduced.1.ll"			source_filename = "reduced.1.ll"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	▲ Show 20 Lines • Show All 236 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/barrier.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	declare void @llvm.nvvm.bar.warp.sync(i32)			declare void @llvm.nvvm.bar.warp.sync(i32)
	declare void @llvm.nvvm.barrier.sync(i32)			declare void @llvm.nvvm.barrier.sync(i32)
	declare void @llvm.nvvm.barrier.sync.cnt(i32, i32)			declare void @llvm.nvvm.barrier.sync.cnt(i32, i32)

	; CHECK-LABEL: .func{{.*}}barrier_sync			; CHECK-LABEL: .func{{.*}}barrier_sync
	define void @barrier_sync(i32 %id, i32 %cnt) {			define void @barrier_sync(i32 %id, i32 %cnt) {
	; CHECK: ld.param.u32 [[ID:%r[0-9]+]], [barrier_sync_param_0];			; CHECK: ld.param.u32 [[ID:%r[0-9]+]], [barrier_sync_param_0];
	Show All 23 Lines

llvm/test/CodeGen/NVPTX/bfe.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}


	; CHECK: bfe0			; CHECK: bfe0
	define i32 @bfe0(i32 %a) {			define i32 @bfe0(i32 %a) {
	; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 4, 4			; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 4, 4
	; CHECK-NOT: shr			; CHECK-NOT: shr
	; CHECK-NOT: and			; CHECK-NOT: and
	%val0 = ashr i32 %a, 4			%val0 = ashr i32 %a, 4
	Show All 23 Lines

llvm/test/CodeGen/NVPTX/branch-fold.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -disable-cgp -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -disable-cgp -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -disable-cgp -verify-machineinstrs \| %ptxas-verify -arch=sm_35 %}

	; Disable CGP which also folds branches, so that only BranchFolding is under			; Disable CGP which also folds branches, so that only BranchFolding is under
	; the spotlight.			; the spotlight.

	target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	define void @foo(i32 %x, float* %output) {			define void @foo(i32 %x, float* %output) {
	; CHECK-LABEL: .visible .func foo(			; CHECK-LABEL: .visible .func foo(
	Show All 31 Lines

llvm/test/CodeGen/NVPTX/bug17709.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	; ModuleID = '__kernelgen_main_module'			; ModuleID = '__kernelgen_main_module'
	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) {			define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) {
	entry:			entry:
	;unreachable			;unreachable
	Show All 17 Lines

llvm/test/CodeGen/NVPTX/bug21465.ll

	; RUN: opt < %s -nvptx-lower-args -S \| FileCheck %s			; RUN: opt < %s -nvptx-lower-args -S \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s --check-prefix PTX			; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s --check-prefix PTX
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx64-unknown-unknown"			target triple = "nvptx64-unknown-unknown"

	%struct.S = type { i32, i32 }			%struct.S = type { i32, i32 }

	; Function Attrs: nounwind			; Function Attrs: nounwind
	define void @_Z11TakesStruct1SPi(%struct.S* byval(%struct.S) nocapture readonly %input, i32* nocapture %output) #0 {			define void @_Z11TakesStruct1SPi(%struct.S* byval(%struct.S) nocapture readonly %input, i32* nocapture %output) #0 {
	Show All 18 Lines

llvm/test/CodeGen/NVPTX/bug22246.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	; CHECK-LABEL: _Z3foobbbPb			; CHECK-LABEL: _Z3foobbbPb
	define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, i8* nocapture %output) {			define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, i8* nocapture %output) {
	entry:			entry:
	; CHECK: selp.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}}			; CHECK: selp.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}}
	%.sink.v = select i1 %p1, i1 %p2, i1 %p3			%.sink.v = select i1 %p1, i1 %p2, i1 %p3
	%frombool5 = zext i1 %.sink.v to i8			%frombool5 = zext i1 %.sink.v to i8
	store i8 %frombool5, i8* %output, align 1			store i8 %frombool5, i8* %output, align 1
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/bug22322.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	%class.float3 = type { float, float, float }			%class.float3 = type { float, float, float }

	; Function Attrs: nounwind			; Function Attrs: nounwind
	; CHECK-LABEL: some_kernel			; CHECK-LABEL: some_kernel
	▲ Show 20 Lines • Show All 53 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/bug26185-2.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	; Verify that we correctly emit code for extending ldg/ldu. We do not expose			; Verify that we correctly emit code for extending ldg/ldu. We do not expose
	; extending variants in the backend, but the ldg/ldu selection code may pick			; extending variants in the backend, but the ldg/ldu selection code may pick
	; extending loads as candidates. We do want to support this, so make sure we			; extending loads as candidates. We do want to support this, so make sure we
	; emit the necessary cvt.* instructions to implement the extension and let ptxas			; emit the necessary cvt.* instructions to implement the extension and let ptxas
	; emit the real extending loads.			; emit the real extending loads.

	target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
	Show All 25 Lines

llvm/test/CodeGen/NVPTX/bug26185.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	; Verify that we correctly emit code for i8 ldg/ldu. We do not expose 8-bit			; Verify that we correctly emit code for i8 ldg/ldu. We do not expose 8-bit
	; registers in the backend, so these loads need special handling.			; registers in the backend, so these loads need special handling.

	target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-unknown-unknown"			target triple = "nvptx64-unknown-unknown"

	; CHECK-LABEL: ex_zext			; CHECK-LABEL: ex_zext
	▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/bug41651.ll

	; RUN: llc -filetype=asm -o - %s \| FileCheck %s			; RUN: llc -filetype=asm -o - %s \| FileCheck %s
				; RUN: %if ptxas %{ llc -filetype=asm -o - %s \| %ptxas-verify %}
				[Inline comment from tra (unsubmitted, not done)]: Here and everywhere -- I'd run conditional ptxas testing as the last step. We should make sure that the actual tests have succeeded first. No point running ptxas on output that we already know is wrong.

	target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	%func = type { i32 (i32, i32)** }			%func = type { i32 (i32, i32)** }

	; CHECK: foo			; CHECK: foo
	; CHECK: call			; CHECK: call
	; CHECK: ret			; CHECK: ret
	define void @foo() {			define void @foo() {
	%call = call %func undef(i32 0, i32 1)			%call = call %func undef(i32 0, i32 1)
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/bypass-div.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	; 64-bit divides and rems should be split into a fast and slow path where			; 64-bit divides and rems should be split into a fast and slow path where
	; the fast path uses a 32-bit operation.			; the fast path uses a 32-bit operation.

	define void @sdiv64(i64 %a, i64 %b, i64* %retptr) {			define void @sdiv64(i64 %a, i64 %b, i64* %retptr) {
	; CHECK-LABEL: sdiv64(			; CHECK-LABEL: sdiv64(
	; CHECK: div.s64			; CHECK: div.s64
	; CHECK: div.u32			; CHECK: div.u32
	▲ Show 20 Lines • Show All 71 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; Checks how NVPTX lowers alloca buffers and their passing to functions.			; Checks how NVPTX lowers alloca buffers and their passing to functions.
	;			;
	; Produced with the following CUDA code:			; Produced with the following CUDA code:
	; extern "C" __attribute__((device)) void callee(float* f, char* buf);			; extern "C" __attribute__((device)) void callee(float* f, char* buf);
	;			;
	; extern "C" __attribute__((global)) void kernel_func(float* a) {			; extern "C" __attribute__((global)) void kernel_func(float* a) {
	; char buf[4 * sizeof(float)];			; char buf[4 * sizeof(float)];
	▲ Show 20 Lines • Show All 57 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/callchain.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target triple = "nvptx"			target triple = "nvptx"

	define void @foo(i8* %ptr) {			define void @foo(i8* %ptr) {
	%fnptr = bitcast i8* %ptr to void ()*			%fnptr = bitcast i8* %ptr to void ()*
	; CHECK: prototype_0 : .callprototype ()_ ()			; CHECK: prototype_0 : .callprototype ()_ ()
	tail call void %fnptr()			tail call void %fnptr()
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/calling-conv.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}


	;; Kernel function using ptx_kernel calling conv			;; Kernel function using ptx_kernel calling conv

	; CHECK: .entry kernel_func			; CHECK: .entry kernel_func
	define ptx_kernel void @kernel_func(float* %a) {			define ptx_kernel void @kernel_func(float* %a) {
	; CHECK: ret			; CHECK: ret
	ret void			ret void
	Show All 20 Lines

llvm/test/CodeGen/NVPTX/calls-with-phi.ll

	; RUN: llc < %s -march=nvptx 2>&1 \| FileCheck %s			; RUN: llc < %s -march=nvptx 2>&1 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx \| %ptxas-verify %}

	; Make sure the example doesn't crash with segfault			; Make sure the example doesn't crash with segfault

	; CHECK: .visible .func ({{.*}}) loop			; CHECK: .visible .func ({{.*}}) loop
	define i32 @loop(i32, i32) {			define i32 @loop(i32, i32) {
	entry:			entry:
	br label %loop			br label %loop

	loop:			loop:
	Show All 13 Lines

llvm/test/CodeGen/NVPTX/combine-min-max.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O2 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O2 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O2 \| %ptxas-verify %}

	; *************************************			; *************************************
	; * Cases with no min/max			; * Cases with no min/max

	define i32 @ab_eq_i32(i32 %a, i32 %b) {			define i32 @ab_eq_i32(i32 %a, i32 %b) {
	; CHECK-LABEL: @ab_eq_i32			; CHECK-LABEL: @ab_eq_i32
	; CHECK-NOT: min			; CHECK-NOT: min
	; CHECK-NOT: max			; CHECK-NOT: max
	▲ Show 20 Lines • Show All 418 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/compare-int.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	;; These tests should run for all targets			;; These tests should run for all targets

	;;===-- Basic instruction selection tests ---------------------------------===;;			;;===-- Basic instruction selection tests ---------------------------------===;;


	;;; i64			;;; i64

	▲ Show 20 Lines • Show All 377 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/constant-vectors.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target triple = "nvptx-nvidia-cuda"			target triple = "nvptx-nvidia-cuda"

	; CHECK: .visible .global .align 16 .b8 testArray[8] = {0, 1, 2, 3, 4, 5, 6, 7};			; CHECK: .visible .global .align 16 .b8 testArray[8] = {0, 1, 2, 3, 4, 5, 6, 7};
	@testArray = constant [2 x <4 x i8>] [<4 x i8> <i8 0, i8 1, i8 2, i8 3>, <4 x i8> <i8 4, i8 5, i8 6, i8 7>], align 16			@testArray = constant [2 x <4 x i8>] [<4 x i8> <i8 0, i8 1, i8 2, i8 3>, <4 x i8> <i8 4, i8 5, i8 6, i8 7>], align 16

llvm/test/CodeGen/NVPTX/convert-fp.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	define i16 @cvt_u16_f32(float %x) {			define i16 @cvt_u16_f32(float %x) {
	; CHECK: cvt.rzi.u16.f32 %rs{{[0-9]+}}, %f{{[0-9]+}};			; CHECK: cvt.rzi.u16.f32 %rs{{[0-9]+}}, %f{{[0-9]+}};
	; CHECK: ret;			; CHECK: ret;
	%a = fptoui float %x to i16			%a = fptoui float %x to i16
	ret i16 %a			ret i16 %a
	}			}
	define i16 @cvt_u16_f64(double %x) {			define i16 @cvt_u16_f64(double %x) {
	▲ Show 20 Lines • Show All 153 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/convert-int-sm20.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}


	;; Integer conversions happen inplicitly by loading/storing the proper types			;; Integer conversions happen inplicitly by loading/storing the proper types


	; i16			; i16

	define i16 @cvt_i16_i32(i32 %x) {			define i16 @cvt_i16_i32(i32 %x) {
	▲ Show 20 Lines • Show All 54 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/convert-sm80.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| FileCheck %s
				; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| %ptxas-verify -arch=sm_80 %}


	; CHECK-LABEL: cvt_rn_bf16x2_f32			; CHECK-LABEL: cvt_rn_bf16x2_f32
	define i32 @cvt_rn_bf16x2_f32(float %f1, float %f2) {			define i32 @cvt_rn_bf16x2_f32(float %f1, float %f2) {

	; CHECK: cvt.rn.bf16x2.f32			; CHECK: cvt.rn.bf16x2.f32
	%val = call i32 @llvm.nvvm.ff2bf16x2.rn(float %f1, float %f2);			%val = call i32 @llvm.nvvm.ff2bf16x2.rn(float %f1, float %f2);

	▲ Show 20 Lines • Show All 127 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/ctlz.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

	declare i16 @llvm.ctlz.i16(i16, i1) readnone			declare i16 @llvm.ctlz.i16(i16, i1) readnone
	declare i32 @llvm.ctlz.i32(i32, i1) readnone			declare i32 @llvm.ctlz.i32(i32, i1) readnone
	declare i64 @llvm.ctlz.i64(i64, i1) readnone			declare i64 @llvm.ctlz.i64(i64, i1) readnone

	; There should be no difference between llvm.ctlz.i32(%a, true) and			; There should be no difference between llvm.ctlz.i32(%a, true) and
	▲ Show 20 Lines • Show All 123 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/ctpop.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

	define i32 @myctpop(i32 %a) {			define i32 @myctpop(i32 %a) {
	; CHECK: popc.b32			; CHECK: popc.b32
	%val = tail call i32 @llvm.ctpop.i32(i32 %a)			%val = tail call i32 @llvm.ctpop.i32(i32 %a)
	ret i32 %val			ret i32 %val
	}			}
	Show All 16 Lines

llvm/test/CodeGen/NVPTX/cttz.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

	declare i16 @llvm.cttz.i16(i16, i1) readnone			declare i16 @llvm.cttz.i16(i16, i1) readnone
	declare i32 @llvm.cttz.i32(i32, i1) readnone			declare i32 @llvm.cttz.i32(i32, i1) readnone
	declare i64 @llvm.cttz.i64(i64, i1) readnone			declare i64 @llvm.cttz.i64(i64, i1) readnone

	define i32 @myctpop(i32 %a) {			define i32 @myctpop(i32 %a) {
	Show All 35 Lines

llvm/test/CodeGen/NVPTX/disable-opt.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O0 \| %ptxas-verify %}

	define void @foo(i32* %output) {			define void @foo(i32* %output) {
	; CHECK-LABEL: .visible .func foo(			; CHECK-LABEL: .visible .func foo(
	entry:			entry:
	%local = alloca i32			%local = alloca i32
	; CHECK: __local_depot			; CHECK: __local_depot
	store i32 1, i32* %local			store i32 1, i32* %local
	%0 = load i32, i32* %local			%0 = load i32, i32* %local
	store i32 %0, i32* %output			store i32 %0, i32* %output
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/div-ri.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 \| %ptxas-verify %}

	define float @foo(float %a) {			define float @foo(float %a) {
	; CHECK: div.approx.f32			; CHECK: div.approx.f32
	%div = fdiv float %a, 13.0			%div = fdiv float %a, 13.0
	ret float %div			ret float %div
	}			}

llvm/test/CodeGen/NVPTX/divrem-combine.ll

	; RUN: llc -O2 < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s --check-prefix=O2 --check-prefix=CHECK			; RUN: llc -O2 < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s --check-prefix=O2 --check-prefix=CHECK
	; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s --check-prefix=O0 --check-prefix=CHECK			; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s --check-prefix=O0 --check-prefix=CHECK
				; RUN: %if ptxas %{ llc -O2 < %s -march=nvptx -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}
				; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	; The following IR			; The following IR
	;			;
	; quot = n / d			; quot = n / d
	; rem = n % d			; rem = n % d
	;			;
	; should be transformed into			; should be transformed into
	;			;
	▲ Show 20 Lines • Show All 102 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/envreg.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}


	declare i32 @llvm.nvvm.read.ptx.sreg.envreg0()			declare i32 @llvm.nvvm.read.ptx.sreg.envreg0()
	declare i32 @llvm.nvvm.read.ptx.sreg.envreg1()			declare i32 @llvm.nvvm.read.ptx.sreg.envreg1()
	declare i32 @llvm.nvvm.read.ptx.sreg.envreg2()			declare i32 @llvm.nvvm.read.ptx.sreg.envreg2()
	declare i32 @llvm.nvvm.read.ptx.sreg.envreg3()			declare i32 @llvm.nvvm.read.ptx.sreg.envreg3()
	declare i32 @llvm.nvvm.read.ptx.sreg.envreg4()			declare i32 @llvm.nvvm.read.ptx.sreg.envreg4()
	declare i32 @llvm.nvvm.read.ptx.sreg.envreg5()			declare i32 @llvm.nvvm.read.ptx.sreg.envreg5()
	▲ Show 20 Lines • Show All 130 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/extloadv.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	define void @foo(float* nocapture readonly %x_value, double* nocapture %output) #0 {			define void @foo(float* nocapture readonly %x_value, double* nocapture %output) #0 {
	%1 = bitcast float* %x_value to <4 x float>*			%1 = bitcast float* %x_value to <4 x float>*
	%2 = load <4 x float>, <4 x float>* %1, align 16			%2 = load <4 x float>, <4 x float>* %1, align 16
	%3 = fpext <4 x float> %2 to <4 x double>			%3 = fpext <4 x float> %2 to <4 x double>
	; CHECK-NOT: ld.v2.f32 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}];			; CHECK-NOT: ld.v2.f32 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}];
	; CHECK: cvt.f64.f32			; CHECK: cvt.f64.f32
	; CHECK: cvt.f64.f32			; CHECK: cvt.f64.f32
	; CHECK: cvt.f64.f32			; CHECK: cvt.f64.f32
	; CHECK: cvt.f64.f32			; CHECK: cvt.f64.f32
	%4 = bitcast double* %output to <4 x double>*			%4 = bitcast double* %output to <4 x double>*
	store <4 x double> %3, <4 x double>* %4			store <4 x double> %3, <4 x double>* %4
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/f16-ex2.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 -mattr=+ptx70 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 -mattr=+ptx70 \| FileCheck %s
				; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_75 -mattr=+ptx70 \| %ptxas-verify -arch=sm_75 %}

	declare half @llvm.nvvm.ex2.approx.f16(half)			declare half @llvm.nvvm.ex2.approx.f16(half)
	declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)			declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)

	; CHECK-LABEL: exp2_half			; CHECK-LABEL: exp2_half
	define half @exp2_half(half %0) {			define half @exp2_half(half %0) {
	; CHECK-NOT: call			; CHECK-NOT: call
	; CHECK: ex2.approx.f16			; CHECK: ex2.approx.f16
	Show All 11 Lines

llvm/test/CodeGen/NVPTX/f16-instructions.ll

	; ## Full FP16 support enabled by default.			; ## Full FP16 support enabled by default.
	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
	; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \			; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
	; RUN: \| FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-F16-NOFTZ %s			; RUN: \| FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-F16-NOFTZ %s
				; RUN: %if ptxas %{ \
				; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
				; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
				; RUN: \| %ptxas-verify -arch=sm_53 \
				; RUN: %}
	; ## Full FP16 with FTZ			; ## Full FP16 with FTZ
	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
	; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \			; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
	; RUN: -denormal-fp-math-f32=preserve-sign \			; RUN: -denormal-fp-math-f32=preserve-sign \
	; RUN: \| FileCheck -check-prefixes CHECK,CHECK-F16-FTZ %s			; RUN: \| FileCheck -check-prefixes CHECK,CHECK-F16-FTZ %s
				; RUN: %if ptxas %{ \
				; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
				; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
				; RUN: -denormal-fp-math-f32=preserve-sign \
				; RUN: \| %ptxas-verify -arch=sm_53 \
				; RUN: %}
	; ## FP16 support explicitly disabled.			; ## FP16 support explicitly disabled.
	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
	; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \			; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
	; RUN: -verify-machineinstrs \			; RUN: -verify-machineinstrs \
	; RUN: \| FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s			; RUN: \| FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s
				; RUN: %if ptxas %{ \
				; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
				; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
				; RUN: \| %ptxas-verify -arch=sm_53 \
				; RUN: %}
	; ## FP16 is not supported by hardware.			; ## FP16 is not supported by hardware.
	; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \			; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
	; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \			; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
	; RUN: \| FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s			; RUN: \| FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s
				; RUN: %if ptxas %{ \
				; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
				; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
				; RUN: \| %ptxas-verify -arch=sm_52 \
				; RUN: %}

	target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"			target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"

	; CHECK-LABEL: test_ret_const(			; CHECK-LABEL: test_ret_const(
	; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00;			; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00;
	; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];			; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
	; CHECK-NEXT: ret;			; CHECK-NEXT: ret;
	define half @test_ret_const() #0 {			define half @test_ret_const() #0 {
	▲ Show 20 Lines • Show All 1,114 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

	; ## Full FP16 support enabled by default.			; ## Full FP16 support enabled by default.
	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
	; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \			; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
	; RUN: \| FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-F16 %s			; RUN: \| FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-F16 %s
				; RUN: %if ptxas %{ \
				; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
				; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
				; RUN: \| %ptxas-verify -arch=sm_53 \
				; RUN: %}
	; ## FP16 support explicitly disabled.			; ## FP16 support explicitly disabled.
	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
	; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \			; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
	; RUN: -verify-machineinstrs \			; RUN: -verify-machineinstrs \
	; RUN: \| FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s			; RUN: \| FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s
				; RUN: %if ptxas %{ \
				; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
				; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
				; RUN: -verify-machineinstrs \
				; RUN: \| %ptxas-verify -arch=sm_53 \
				; RUN: %}
	; ## FP16 is not supported by hardware.			; ## FP16 is not supported by hardware.
	; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \			; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
	; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \			; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
	; RUN: \| FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s			; RUN: \| FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s
				; RUN: %if ptxas %{ \
				; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
				; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
				; RUN: \| %ptxas-verify -arch=sm_52 \
				; RUN: %}

	target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"			target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"

	; CHECK-LABEL: test_ret_const(			; CHECK-LABEL: test_ret_const(
	; CHECK: mov.u32 [[T:%r[0-9+]]], 1073757184;			; CHECK: mov.u32 [[T:%r[0-9+]]], 1073757184;
	; CHECK: mov.b32 [[R:%hh[0-9+]]], [[T]];			; CHECK: mov.b32 [[R:%hh[0-9+]]], [[T]];
	; CHECK: st.param.b32 [func_retval0+0], [[R]];			; CHECK: st.param.b32 [func_retval0+0], [[R]];
	; CHECK-NEXT: ret;			; CHECK-NEXT: ret;
	▲ Show 20 Lines • Show All 1,452 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/fast-math.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	declare float @llvm.sqrt.f32(float)			declare float @llvm.sqrt.f32(float)
	declare double @llvm.sqrt.f64(double)			declare double @llvm.sqrt.f64(double)

	; CHECK-LABEL: sqrt_div(			; CHECK-LABEL: sqrt_div(
	; CHECK: sqrt.rn.f32			; CHECK: sqrt.rn.f32
	; CHECK: div.rn.f32			; CHECK: div.rn.f32
	define float @sqrt_div(float %a, float %b) {			define float @sqrt_div(float %a, float %b) {
	▲ Show 20 Lines • Show All 235 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/fma-assoc.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast \| FileCheck %s -check-prefix=CHECK			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast \| FileCheck %s -check-prefix=CHECK
	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math \| FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math \| FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math \| %ptxas-verify %}

	define ptx_device float @t1_f32(float %x, float %y, float %z,			define ptx_device float @t1_f32(float %x, float %y, float %z,
	float %u, float %v) {			float %u, float %v) {
	; CHECK-UNSAFE: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};			; CHECK-UNSAFE: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
	; CHECK-UNSAFE: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};			; CHECK-UNSAFE: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
	; CHECK: ret;			; CHECK: ret;
	%a = fmul float %x, %y			%a = fmul float %x, %y
	%b = fmul float %u, %v			%b = fmul float %u, %v
	Show All 29 Lines

llvm/test/CodeGen/NVPTX/fma-disable.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=1 \| FileCheck %s -check-prefix=FMA			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=1 \| FileCheck %s -check-prefix=FMA
	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 \| FileCheck %s -check-prefix=MUL			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 \| FileCheck %s -check-prefix=MUL
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 \| FileCheck %s -check-prefix=FMA			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 \| FileCheck %s -check-prefix=FMA
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 \| FileCheck %s -check-prefix=MUL			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 \| FileCheck %s -check-prefix=MUL
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=1 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 \| %ptxas-verify %}

	define ptx_device float @test_mul_add_f(float %x, float %y, float %z) {			define ptx_device float @test_mul_add_f(float %x, float %y, float %z) {
	entry:			entry:
	; FMA: fma.rn.f32			; FMA: fma.rn.f32
	; MUL: mul.rn.f32			; MUL: mul.rn.f32
	; MUL: add.rn.f32			; MUL: add.rn.f32
	%a = fmul float %x, %y			%a = fmul float %x, %y
	%b = fadd float %a, %z			%b = fadd float %a, %z
	Show All 12 Lines

llvm/test/CodeGen/NVPTX/fma.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs \| %ptxas-verify %}

	declare float @dummy_f32(float, float) #0			declare float @dummy_f32(float, float) #0
	declare double @dummy_f64(double, double) #0			declare double @dummy_f64(double, double) #0

	define ptx_device float @t1_f32(float %x, float %y, float %z) {			define ptx_device float @t1_f32(float %x, float %y, float %z) {
	; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};			; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
	; CHECK: ret;			; CHECK: ret;
	%a = fmul float %x, %y			%a = fmul float %x, %y
	Show All 33 Lines

llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll

	; RUN: llc < %s -march=nvptx \| FileCheck %s --check-prefixes=CHECK,CHECK-NONAN			; RUN: llc < %s -march=nvptx \| FileCheck %s --check-prefixes=CHECK,CHECK-NONAN
	; RUN: llc < %s -march=nvptx -mcpu=sm_80 \| FileCheck %s --check-prefixes=CHECK,CHECK-NAN			; RUN: llc < %s -march=nvptx -mcpu=sm_80 \| FileCheck %s --check-prefixes=CHECK,CHECK-NAN
				; RUN: %if ptxas %{ llc < %s -march=nvptx \| %ptxas-verify %}
				; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 \| %ptxas-verify -arch=sm_80 %}

	; ---- minimum ----			; ---- minimum ----

	; CHECK-LABEL: minimum_half			; CHECK-LABEL: minimum_half
	define half @minimum_half(half %a) #0 {			define half @minimum_half(half %a) #0 {
	; CHECK-NONAN: setp			; CHECK-NONAN: setp
	; CHECK-NONAN: selp.b16			; CHECK-NONAN: selp.b16
	; CHECK-NAN: min.NaN.f16			; CHECK-NAN: min.NaN.f16
	▲ Show 20 Lines • Show All 78 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/fns.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	declare i32 @llvm.nvvm.fns(i32, i32, i32)			declare i32 @llvm.nvvm.fns(i32, i32, i32)

	; CHECK-LABEL: .func{{.*}}fns			; CHECK-LABEL: .func{{.*}}fns
	define i32 @fns(i32 %mask, i32 %base, i32 %offset) {			define i32 @fns(i32 %mask, i32 %base, i32 %offset) {
	; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [fns_param_0];			; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [fns_param_0];
	; CHECK: ld.param.u32 [[BASE:%r[0-9]+]], [fns_param_1];			; CHECK: ld.param.u32 [[BASE:%r[0-9]+]], [fns_param_1];
	; CHECK: ld.param.u32 [[OFFSET:%r[0-9]+]], [fns_param_2];			; CHECK: ld.param.u32 [[OFFSET:%r[0-9]+]], [fns_param_2];
	Show All 27 Lines

llvm/test/CodeGen/NVPTX/fp-contract.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast \| FileCheck %s --check-prefix=FAST			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast \| FileCheck %s --check-prefix=FAST
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 \| FileCheck %s --check-prefix=DEFAULT			; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 \| FileCheck %s --check-prefix=DEFAULT
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	target triple = "nvptx64-unknown-cuda"			target triple = "nvptx64-unknown-cuda"

	;; Make sure we are generating proper instruction sequences for fused ops			;; Make sure we are generating proper instruction sequences for fused ops
	;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit			;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
	;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas			;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas
	;; is free to fuse with a multiply if it is able. If fusion is not allowed,			;; is free to fuse with a multiply if it is able. If fusion is not allowed,
	;; we do not form fma.rn at the PTX level and explicitly generate add.rn			;; we do not form fma.rn at the PTX level and explicitly generate add.rn
	Show All 23 Lines

llvm/test/CodeGen/NVPTX/fp-literals.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast \| %ptxas-verify %}

	target triple = "nvptx64-unknown-cuda"			target triple = "nvptx64-unknown-cuda"
	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

	; Make sure we can properly differentiate between single-precision and			; Make sure we can properly differentiate between single-precision and
	; double-precision FP literals.			; double-precision FP literals.

	; CHECK: myaddf			; CHECK: myaddf
	Show All 12 Lines

llvm/test/CodeGen/NVPTX/fp16.ll

	; RUN: llc -march=nvptx -verify-machineinstrs < %s \| FileCheck %s			; RUN: llc -march=nvptx -verify-machineinstrs < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc -march=nvptx -verify-machineinstrs < %s \| %ptxas-verify %}

	declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone			declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
	declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone			declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
	declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone			declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
	declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone			declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone

	; CHECK-LABEL: @test_convert_fp16_to_fp32			; CHECK-LABEL: @test_convert_fp16_to_fp32
	; CHECK: cvt.f32.f16			; CHECK: cvt.f32.f16
	Show All 36 Lines

llvm/test/CodeGen/NVPTX/function-align.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK-NOT: .align 2			; CHECK-NOT: .align 2
	define ptx_device void @foo() align 2 {			define ptx_device void @foo() align 2 {
	; CHECK-LABEL: .func foo			; CHECK-LABEL: .func foo
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx-nvidia-cuda"			target triple = "nvptx-nvidia-cuda"

	; Ensure global variables in address space 0 are promoted to address space 1			; Ensure global variables in address space 0 are promoted to address space 1

	; CHECK: .global .align 4 .u32 myglobal = 42;			; CHECK: .global .align 4 .u32 myglobal = 42;
	@myglobal = internal global i32 42, align 4			@myglobal = internal global i32 42, align 4
	Show All 20 Lines

llvm/test/CodeGen/NVPTX/global-addrspace.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX32			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX32
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX64			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX64
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; PTX32: .visible .global .align 4 .u32 i;			; PTX32: .visible .global .align 4 .u32 i;
	; PTX32: .visible .const .align 4 .u32 j;			; PTX32: .visible .const .align 4 .u32 j;
	; PTX32: .visible .shared .align 4 .u32 k;			; PTX32: .visible .shared .align 4 .u32 k;
	; PTX64: .visible .global .align 4 .u32 i;			; PTX64: .visible .global .align 4 .u32 i;
	; PTX64: .visible .const .align 4 .u32 j;			; PTX64: .visible .const .align 4 .u32 j;
	; PTX64: .visible .shared .align 4 .u32 k;			; PTX64: .visible .shared .align 4 .u32 k;
	@i = addrspace(1) externally_initialized global i32 0, align 4			@i = addrspace(1) externally_initialized global i32 0, align 4
	@j = addrspace(4) externally_initialized global i32 0, align 4			@j = addrspace(4) externally_initialized global i32 0, align 4
	@k = addrspace(3) global i32 undef, align 4			@k = addrspace(3) global i32 undef, align 4

llvm/test/CodeGen/NVPTX/global-ordering.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX32			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX32
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX64			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX64
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; Make sure we emit these globals in def-use order			; Make sure we emit these globals in def-use order


	; PTX32: .visible .global .align 1 .u8 a = 2;			; PTX32: .visible .global .align 1 .u8 a = 2;
	; PTX32-NEXT: .visible .global .align 4 .u32 a2 = a;			; PTX32-NEXT: .visible .global .align 4 .u32 a2 = a;
	; PTX64: .visible .global .align 1 .u8 a = 2;			; PTX64: .visible .global .align 1 .u8 a = 2;
	; PTX64-NEXT: .visible .global .align 8 .u64 a2 = a;			; PTX64-NEXT: .visible .global .align 8 .u64 a2 = a;
	Show All 10 Lines

llvm/test/CodeGen/NVPTX/global-variable-big.ll

	; RUN: llc < %s \| FileCheck %s			; RUN: llc < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s \| %ptxas-verify %}

	target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	; Check that we can handle global variables of large integer type.			; Check that we can handle global variables of large integer type.

	; (lsb) 0x0102'0304'0506...0F10 (msb)			; (lsb) 0x0102'0304'0506...0F10 (msb)
	@gv = addrspace(1) externally_initialized global i128 21345817372864405881847059188222722561, align 16			@gv = addrspace(1) externally_initialized global i128 21345817372864405881847059188222722561, align 16
	; CHECK: .visible .global .align 16 .b8 gv[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};			; CHECK: .visible .global .align 16 .b8 gv[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};

llvm/test/CodeGen/NVPTX/global-visibility.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; PTX does not support .hidden or .protected.			; PTX does not support .hidden or .protected.
	; Make sure we do not emit them.			; Make sure we do not emit them.

	define hidden void @f_hidden() {			define hidden void @f_hidden() {
	ret void			ret void
	}			}
	; CHECK-NOT: .hidden			; CHECK-NOT: .hidden
	; CHECK: .visible .func f_hidden			; CHECK: .visible .func f_hidden

	define protected void @f_protected() {			define protected void @f_protected() {
	ret void			ret void
	}			}
	; CHECK-NOT: .protected			; CHECK-NOT: .protected
	; CHECK: .visible .func f_protected			; CHECK: .visible .func f_protected

llvm/test/CodeGen/NVPTX/globals_init.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; Make sure the globals' constant initializers are not prone to host endianness			; Make sure the globals' constant initializers are not prone to host endianness
	; issues.			; issues.

	; CHECK-DAG: .b8 Gbli08[2] = {171, 205};			; CHECK-DAG: .b8 Gbli08[2] = {171, 205};
	@Gbli08 = global [2 x i8] [i8 171, i8 205]			@Gbli08 = global [2 x i8] [i8 171, i8 205]

	; CHECK-DAG: .b8 Gbli16[4] = {205, 171, 1, 239};			; CHECK-DAG: .b8 Gbli16[4] = {205, 171, 1, 239};
	Show All 21 Lines

llvm/test/CodeGen/NVPTX/globals_lowering.ll

	; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -relocation-model=static \| FileCheck %s --check-prefix CHK			; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -relocation-model=static \| FileCheck %s --check-prefix CHK
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -relocation-model=static \| %ptxas-verify %}

	%MyStruct = type { i32, i32, float }			%MyStruct = type { i32, i32, float }
	@Gbl = internal addrspace(3) global [1024 x %MyStruct] zeroinitializer			@Gbl = internal addrspace(3) global [1024 x %MyStruct] zeroinitializer

	; CHK-LABEL: foo			; CHK-LABEL: foo
	define void @foo(float %f) {			define void @foo(float %f) {
	entry:			entry:
	; CHK: ld.shared.f32 %{{[a-zA-Z0-9]+}}, [Gbl+8];			; CHK: ld.shared.f32 %{{[a-zA-Z0-9]+}}, [Gbl+8];
	%0 = load float, float addrspace(3)* getelementptr inbounds ([1024 x %MyStruct], [1024 x %MyStruct] addrspace(3)* @Gbl, i32 0, i32 0, i32 2)			%0 = load float, float addrspace(3)* getelementptr inbounds ([1024 x %MyStruct], [1024 x %MyStruct] addrspace(3)* @Gbl, i32 0, i32 0, i32 2)
	%add = fadd float %0, %f			%add = fadd float %0, %f
	; CHK: st.shared.f32 [Gbl+8], %{{[a-zA-Z0-9]+}};			; CHK: st.shared.f32 [Gbl+8], %{{[a-zA-Z0-9]+}};
	store float %add, float addrspace(3)* getelementptr inbounds ([1024 x %MyStruct], [1024 x %MyStruct] addrspace(3)* @Gbl, i32 0, i32 0, i32 2)			store float %add, float addrspace(3)* getelementptr inbounds ([1024 x %MyStruct], [1024 x %MyStruct] addrspace(3)* @Gbl, i32 0, i32 0, i32 2)
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/half.ll

	; RUN: llc < %s -march=nvptx \| FileCheck %s			; RUN: llc < %s -march=nvptx \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx \| %ptxas-verify %}

	; CHECK: .b8 half_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};			; CHECK: .b8 half_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
	@"half_array" = addrspace(1) constant [4 x half]			@"half_array" = addrspace(1) constant [4 x half]
	[half 0xH0201, half 0xH0403, half 0xH0605, half 0xH0807]			[half 0xH0201, half 0xH0403, half 0xH0605, half 0xH0807]

	define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {			define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {
	; CHECK-LABEL: @test_load_store			; CHECK-LABEL: @test_load_store
	; CHECK: ld.global.b16 [[TMP:%h[0-9]+]], [{{%r[0-9]+}}]			; CHECK: ld.global.b16 [[TMP:%h[0-9]+]], [{{%r[0-9]+}}]
	▲ Show 20 Lines • Show All 65 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/i1-global.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx-nvidia-cuda"			target triple = "nvptx-nvidia-cuda"

	; CHECK: .visible .global .align 1 .u8 mypred			; CHECK: .visible .global .align 1 .u8 mypred
	@mypred = addrspace(1) global i1 true, align 1			@mypred = addrspace(1) global i1 true, align 1


	Show All 10 Lines

llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK-LABEL: foo			; CHECK-LABEL: foo
	; CHECK: setp			; CHECK: setp
	; CHECK: selp			; CHECK: selp
	; CHECK: cvt.rn.f32.u32			; CHECK: cvt.rn.f32.u32
	define float @foo(i1 %a) {			define float @foo(i1 %a) {
	%ret = uitofp i1 %a to float			%ret = uitofp i1 %a to float
	ret float %ret			ret float %ret
	Show All 28 Lines

llvm/test/CodeGen/NVPTX/i1-param.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx-nvidia-cuda"			target triple = "nvptx-nvidia-cuda"

	; Make sure predicate (i1) operands to kernels get expanded out to .u8			; Make sure predicate (i1) operands to kernels get expanded out to .u8

	; CHECK: .entry foo			; CHECK: .entry foo
	; CHECK: .param .u8 foo_param_0			; CHECK: .param .u8 foo_param_0
	Show All 10 Lines

llvm/test/CodeGen/NVPTX/i128-global.ll

	; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK: .visible .global .align 16 .b8 G1[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};			; CHECK: .visible .global .align 16 .b8 G1[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
	@G1 = global i128 1			@G1 = global i128 1

	; CHECK: .visible .global .align 16 .b8 G2[16];			; CHECK: .visible .global .align 16 .b8 G2[16];
	@G2 = global i128 0			@G2 = global i128 0

llvm/test/CodeGen/NVPTX/i128-param.ll

	; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK-LABEL: .visible .func callee(			; CHECK-LABEL: .visible .func callee(
	; CHECK-NEXT: .param .align 16 .b8 callee_param_0[16],			; CHECK-NEXT: .param .align 16 .b8 callee_param_0[16],
	; CHECK-NEXT: .param .align 16 .b8 callee_param_1[16],			; CHECK-NEXT: .param .align 16 .b8 callee_param_1[16],
	define void @callee(i128, i128, i128*) {			define void @callee(i128, i128, i128*) {
	; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];			; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
	; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [callee_param_1];			; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [callee_param_1];

	▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/i128-retval.ll

	; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[16]) callee(			; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[16]) callee(
	define i128 @callee(i128) {			define i128 @callee(i128) {
	; CHECK: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];			; CHECK: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
	; CHECK: st.param.v2.b64 [func_retval0+0], {%[[REG0]], %[[REG1]]}			; CHECK: st.param.v2.b64 [func_retval0+0], {%[[REG0]], %[[REG1]]}
	ret i128 %0			ret i128 %0
	}			}

	Show All 19 Lines

llvm/test/CodeGen/NVPTX/i128-struct.ll

	; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[32]) foo(			; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[32]) foo(
	define { i128, i128 } @foo(i64 %a, i32 %b) {			define { i128, i128 } @foo(i64 %a, i32 %b) {
	%1 = sext i64 %a to i128			%1 = sext i64 %a to i128
	%2 = sext i32 %b to i128			%2 = sext i32 %b to i128
	%3 = insertvalue { i128, i128 } undef, i128 %1, 0			%3 = insertvalue { i128, i128 } undef, i128 %1, 0
	%4 = insertvalue { i128, i128 } %3, i128 %2, 1			%4 = insertvalue { i128, i128 } %3, i128 %2, 1

	; CHECK: st.param.v2.b64 [func_retval0+0], {%[[REG1:rd[0-9]+]], %[[REG2:rd[0-9]+]]};			; CHECK: st.param.v2.b64 [func_retval0+0], {%[[REG1:rd[0-9]+]], %[[REG2:rd[0-9]+]]};
	; CHECK: st.param.v2.b64 [func_retval0+16], {%[[REG3:rd[0-9]+]], %[[REG4:rd[0-9]+]]};			; CHECK: st.param.v2.b64 [func_retval0+16], {%[[REG3:rd[0-9]+]], %[[REG4:rd[0-9]+]]};
	ret { i128, i128 } %4			ret { i128, i128 } %4
	}			}

llvm/test/CodeGen/NVPTX/i8-param.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

	; CHECK: .visible .func (.param .b32 func_retval0) callee			; CHECK: .visible .func (.param .b32 func_retval0) callee
	define i8 @callee(i8 %a) {			define i8 @callee(i8 %a) {
	; CHECK: ld.param.u8			; CHECK: ld.param.u8
	%ret = add i8 %a, 42			%ret = add i8 %a, 42
	; CHECK: st.param.b32			; CHECK: st.param.b32
	Show All 14 Lines

llvm/test/CodeGen/NVPTX/idioms.ll

	; Check that various LLVM idioms get lowered to NVPTX as expected.			; Check that various LLVM idioms get lowered to NVPTX as expected.

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK-LABEL: abs_i16(			; CHECK-LABEL: abs_i16(
	define i16 @abs_i16(i16 %a) {			define i16 @abs_i16(i16 %a) {
	; CHECK: abs.s16			; CHECK: abs.s16
	%neg = sub i16 0, %a			%neg = sub i16 0, %a
	%abs.cond = icmp sge i16 %a, 0			%abs.cond = icmp sge i16 %a, 0
	%abs = select i1 %abs.cond, i16 %a, i16 %neg			%abs = select i1 %abs.cond, i16 %a, i16 %neg
	ret i16 %abs			ret i16 %abs
	Show All 19 Lines

llvm/test/CodeGen/NVPTX/imad.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK: imad			; CHECK: imad
	define i32 @imad(i32 %a, i32 %b, i32 %c) {			define i32 @imad(i32 %a, i32 %b, i32 %c) {
	; CHECK: mad.lo.s32			; CHECK: mad.lo.s32
	%val0 = mul i32 %a, %b			%val0 = mul i32 %a, %b
	%val1 = add i32 %val0, %c			%val1 = add i32 %val0, %c
	ret i32 %val1			ret i32 %val1
	}			}

llvm/test/CodeGen/NVPTX/inline-asm.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	define float @test(float %x) {			define float @test(float %x) {
	entry:			entry:
	; CHECK: ex2.approx.ftz.f32 %f{{[0-9]+}}, %f{{[0-9]+}}			; CHECK: ex2.approx.ftz.f32 %f{{[0-9]+}}, %f{{[0-9]+}}
	%0 = call float asm "ex2.approx.ftz.f32 $0, $1;", "=f,f"(float %x)			%0 = call float asm "ex2.approx.ftz.f32 $0, $1;", "=f,f"(float %x)
	ret float %0			ret float %0
	}			}

	define i32 @foo(i1 signext %cond, i32 %a, i32 %b) #0 {			define i32 @foo(i1 signext %cond, i32 %a, i32 %b) #0 {
	entry:			entry:
	; CHECK: selp.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}}			; CHECK: selp.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}}
	%0 = tail call i32 asm "selp.b32 $0, $1, $2, $3;", "=r,r,r,b"(i32 %a, i32 %b, i1 %cond)			%0 = tail call i32 asm "selp.b32 $0, $1, $2, $3;", "=r,r,r,b"(i32 %a, i32 %b, i1 %cond)
	ret i32 %0			ret i32 %0
	}			}

llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll

	; RUN: llc -march=nvptx < %s \| FileCheck %s			; RUN: llc -march=nvptx < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc -march=nvptx < %s \| %ptxas-verify %}

	; Test that %c works with immediates			; Test that %c works with immediates
	; CHECK-LABEL: test_inlineasm_c_output_template0			; CHECK-LABEL: test_inlineasm_c_output_template0
	; CHECK: //TEST 42			; CHECK: //TEST 42
	define dso_local i32 @test_inlineasm_c_output_template0() {			define dso_local i32 @test_inlineasm_c_output_template0() {
	tail call void asm sideeffect "//TEST ${0:c}", "i"(i32 42)			tail call void asm sideeffect "//TEST ${0:c}", "i"(i32 42)
	ret i32 42			ret i32 42
	}			}
	Show All 19 Lines

llvm/test/CodeGen/NVPTX/intrinsic-old.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck -allow-deprecated-dag-overlap %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck -allow-deprecated-dag-overlap %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck -allow-deprecated-dag-overlap %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck -allow-deprecated-dag-overlap %s
	; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -nvvm-intr-range \			; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -nvvm-intr-range \
	; RUN: \| FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_20 %s			; RUN: \| FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_20 %s
	; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \			; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \
	; RUN: \| FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_20 %s			; RUN: \| FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_20 %s
	; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \			; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \
	; RUN: -nvvm-intr-range -nvvm-intr-range-sm=30 \			; RUN: -nvvm-intr-range -nvvm-intr-range-sm=30 \
	; RUN: \| FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_30 %s			; RUN: \| FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_30 %s
	; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \			; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \
	; RUN: -passes=nvvm-intr-range -nvvm-intr-range-sm=30 \			; RUN: -passes=nvvm-intr-range -nvvm-intr-range-sm=30 \
	; RUN: \| FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_30 %s			; RUN: \| FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_30 %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	define ptx_device i32 @test_tid_x() {			define ptx_device i32 @test_tid_x() {
	; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;			; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;
	; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[BLK_IDX_XY:[0-9]+]]			; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[BLK_IDX_XY:[0-9]+]]
	; CHECK: ret;			; CHECK: ret;
	%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()			%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
	ret i32 %x			ret i32 %x
	}			}
	▲ Show 20 Lines • Show All 318 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/intrinsics.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK-LABEL: test_fabsf(			; CHECK-LABEL: test_fabsf(
	define float @test_fabsf(float %f) {			define float @test_fabsf(float %f) {
	; CHECK: abs.f32			; CHECK: abs.f32
	%x = call float @llvm.fabs.f32(float %f)			%x = call float @llvm.fabs.f32(float %f)
	ret float %x			ret float %x
	}			}

	▲ Show 20 Lines • Show All 136 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/isspacep.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	declare i1 @llvm.nvvm.isspacep.const(i8*) readnone noinline			declare i1 @llvm.nvvm.isspacep.const(i8*) readnone noinline
	declare i1 @llvm.nvvm.isspacep.global(i8*) readnone noinline			declare i1 @llvm.nvvm.isspacep.global(i8*) readnone noinline
	declare i1 @llvm.nvvm.isspacep.local(i8*) readnone noinline			declare i1 @llvm.nvvm.isspacep.local(i8*) readnone noinline
	declare i1 @llvm.nvvm.isspacep.shared(i8*) readnone noinline			declare i1 @llvm.nvvm.isspacep.shared(i8*) readnone noinline

	; CHECK: is_const			; CHECK: is_const
	define i1 @is_const(i8* %addr) {			define i1 @is_const(i8* %addr) {
	Show All 26 Lines

llvm/test/CodeGen/NVPTX/ld-addrspace.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefixes=ALL,G32,LS32			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefixes=ALL,G32,LS32
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefixes=ALL,G64,LS64			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefixes=ALL,G64,LS64
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr \| FileCheck %s --check-prefixes=G64,LS32			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr \| FileCheck %s --check-prefixes=G64,LS32
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr \| %ptxas-verify %}


	;; i8			;; i8
	define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {			define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
	; ALL-LABEL: ld_global_i8			; ALL-LABEL: ld_global_i8
	; G32: ld.global.u8 %{{.*}}, [%r{{[0-9]+}}]			; G32: ld.global.u8 %{{.*}}, [%r{{[0-9]+}}]
	; G64: ld.global.u8 %{{.*}}, [%rd{{[0-9]+}}]			; G64: ld.global.u8 %{{.*}}, [%rd{{[0-9]+}}]
	; ALL: ret			; ALL: ret
	▲ Show 20 Lines • Show All 149 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/ld-generic.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX32			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX32
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX64			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX64
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}


	;; i8			;; i8
	define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {			define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {
	; PTX32: ld.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]			; PTX32: ld.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
	; PTX32: ret			; PTX32: ret
	; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]			; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
	; PTX64: ret			; PTX64: ret
	▲ Show 20 Lines • Show All 53 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py

	# This test generates all variants of load/store instructions and verifies that			# This test generates all variants of load/store instructions and verifies that
	# LLVM generates correct PTX for them.			# LLVM generates correct PTX for them.

	# RUN: %python %s > %t.ll			# RUN: %python %s > %t.ll
	# RUN: llc < %t.ll -march=nvptx64 -mcpu=sm_30 \| FileCheck -check-prefixes=CHECK,CHECK_P64 %t.ll			# RUN: llc < %t.ll -march=nvptx64 -mcpu=sm_30 \| FileCheck -check-prefixes=CHECK,CHECK_P64 %t.ll
	# RUN: llc < %t.ll -march=nvptx -mcpu=sm_30 \| FileCheck -check-prefixes=CHECK,CHECK_P32 %t.ll			# RUN: llc < %t.ll -march=nvptx -mcpu=sm_30 \| FileCheck -check-prefixes=CHECK,CHECK_P32 %t.ll
				# RUN: %if ptxas %{ llc < %t.ll -march=nvptx64 -mcpu=sm_30 \| %ptxas-verify %}
				# RUN: %if ptxas %{ llc < %t.ll -march=nvptx -mcpu=sm_30 \| %ptxas-verify %}

	from __future__ import print_function			from __future__ import print_function

	from itertools import product			from itertools import product
	from string import Template			from string import Template

	llvm_type_to_ptx_type = {			llvm_type_to_ptx_type = {
	"i8": "u8",			"i8": "u8",
	▲ Show 20 Lines • Show All 90 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/ldg-invariant.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	; Check that invariant loads from the global addrspace are lowered to			; Check that invariant loads from the global addrspace are lowered to
	; ld.global.nc.			; ld.global.nc.

	; CHECK-LABEL: @ld_global			; CHECK-LABEL: @ld_global
	define i32 @ld_global(i32 addrspace(1)* %ptr) {			define i32 @ld_global(i32 addrspace(1)* %ptr) {
	; CHECK: ld.global.nc.{{[a-z]}}32			; CHECK: ld.global.nc.{{[a-z]}}32
	%a = load i32, i32 addrspace(1)* %ptr, !invariant.load !0			%a = load i32, i32 addrspace(1)* %ptr, !invariant.load !0
	▲ Show 20 Lines • Show All 87 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/ldparam-v4.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	declare <4 x float> @bar()			declare <4 x float> @bar()

	; CHECK-LABEL: .func foo(			; CHECK-LABEL: .func foo(
	define void @foo(<4 x float>* %ptr) {			define void @foo(<4 x float>* %ptr) {
	; CHECK: ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0];			; CHECK: ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0];
	; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0];			; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0];
	; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}			; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}
	%val = tail call <4 x float> @bar()			%val = tail call <4 x float> @bar()
	store <4 x float> %val, <4 x float>* %ptr			store <4 x float> %val, <4 x float>* %ptr
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/ldu-i8.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

	declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*, i32)			declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*, i32)

	define i8 @foo(i8* %a) {			define i8 @foo(i8* %a) {
	; Ensure we properly truncate off the high-order 24 bits			; Ensure we properly truncate off the high-order 24 bits
	; CHECK: ldu.global.u8			; CHECK: ldu.global.u8
	; CHECK: cvt.u32.u16			; CHECK: cvt.u32.u16
	; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 255			; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 255
	%val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a, i32 4)			%val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a, i32 4)
	ret i8 %val			ret i8 %val
	}			}

llvm/test/CodeGen/NVPTX/ldu-ldg.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}


	declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)			declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
	declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)			declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
	declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)			declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
	declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)			declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)


	Show All 27 Lines

llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"


	define void @reg_plus_offset(i32* %a) {			define void @reg_plus_offset(i32* %a) {
	; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+32];			; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+32];
	; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+36];			; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+36];
	%p2 = getelementptr i32, i32* %a, i32 8			%p2 = getelementptr i32, i32* %a, i32 8
	Show All 10 Lines

llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll

	; RUN: llc < %s -march=nvptx 2>&1 \| FileCheck %s			; RUN: llc < %s -march=nvptx 2>&1 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx \| %ptxas-verify %}

	; Allow to make libcalls that are defined in the current module			; Allow to make libcalls that are defined in the current module

	declare i8* @malloc(i64)			declare i8* @malloc(i64)
	declare void @free(i8*)			declare void @free(i8*)

	; Underlying libcall declaration			; Underlying libcall declaration
	; CHECK: .visible .func (.param .align 16 .b8 func_retval0[16]) __umodti3			; CHECK: .visible .func (.param .align 16 .b8 func_retval0[16]) __umodti3

	Show All 36 Lines

llvm/test/CodeGen/NVPTX/load-sext-i1.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx-nvidia-cuda"			target triple = "nvptx-nvidia-cuda"

	define void @main(i1* %a1, i32 %a2, i32* %arg3) {			define void @main(i1* %a1, i32 %a2, i32* %arg3) {
	; CHECK: ld.u8			; CHECK: ld.u8
	; CHECK-NOT: ld.u1			; CHECK-NOT: ld.u1
	%t1 = getelementptr i1, i1* %a1, i32 %a2			%t1 = getelementptr i1, i1* %a1, i32 %a2
	%t2 = load i1, i1* %t1			%t2 = load i1, i1* %t1
	%t3 = sext i1 %t2 to i32			%t3 = sext i1 %t2 to i32
	store i32 %t3, i32* %arg3			store i32 %t3, i32* %arg3
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/load-store.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK-LABEL: plain			; CHECK-LABEL: plain
	define void @plain(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {			define void @plain(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {
	; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]			; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
	%a.load = load i8, i8* %a			%a.load = load i8, i8* %a
	%a.add = add i8 %a.load, 1			%a.add = add i8 %a.load, 1
	; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}			; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
	store i8 %a.add, i8* %a			store i8 %a.add, i8* %a
	▲ Show 20 Lines • Show All 85 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck -check-prefix=SM20 %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck -check-prefix=SM20 %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck -check-prefix=SM35 %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck -check-prefix=SM35 %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx64-unknown-unknown"			target triple = "nvptx64-unknown-unknown"

	; SM20-LABEL: .visible .entry foo1(			; SM20-LABEL: .visible .entry foo1(
	; SM20: ld.global.f32			; SM20: ld.global.f32
	; SM35-LABEL: .visible .entry foo1(			; SM35-LABEL: .visible .entry foo1(
	; SM35: ld.global.nc.f32			; SM35: ld.global.nc.f32
	▲ Show 20 Lines • Show All 254 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/local-stack-frame.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=PTX32			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=PTX32
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=PTX64			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=PTX64
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	; Ensure we access the local stack properly			; Ensure we access the local stack properly

	; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};			; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
	; PTX32: cvta.local.u32 %SP, %SPL;			; PTX32: cvta.local.u32 %SP, %SPL;
	; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];			; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
	; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}};			; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
	; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};			; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
	▲ Show 20 Lines • Show All 72 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 \| FileCheck %s --check-prefix PTX			; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 \| FileCheck %s --check-prefix PTX
	; RUN: opt < %s -S -nvptx-lower-aggr-copies \| FileCheck %s --check-prefix IR			; RUN: opt < %s -S -nvptx-lower-aggr-copies \| FileCheck %s --check-prefix IR
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 \| %ptxas-verify -arch=sm_35 %}

	; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to			; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
	; llvm.mem* intrinsics get lowered to loops.			; llvm.mem* intrinsics get lowered to loops.

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "nvptx64-unknown-unknown"			target triple = "nvptx64-unknown-unknown"

	declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) #1			declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) #1
	▲ Show 20 Lines • Show All 174 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/lower-alloca.ll

	; RUN: opt < %s -S -nvptx-lower-alloca -infer-address-spaces \| FileCheck %s			; RUN: opt < %s -S -nvptx-lower-alloca -infer-address-spaces \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s --check-prefix PTX			; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s --check-prefix PTX
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx64-unknown-unknown"			target triple = "nvptx64-unknown-unknown"

	define void @kernel() {			define void @kernel() {
	; LABEL: @lower_alloca			; LABEL: @lower_alloca
	; PTX-LABEL: .visible .entry kernel(			; PTX-LABEL: .visible .entry kernel(
	%A = alloca i32			%A = alloca i32
	Show All 12 Lines

llvm/test/CodeGen/NVPTX/lower-args.ll

	; RUN: opt < %s -S -nvptx-lower-args \| FileCheck %s --check-prefix IR			; RUN: opt < %s -S -nvptx-lower-args \| FileCheck %s --check-prefix IR
	; RUN: llc < %s -mcpu=sm_20 \| FileCheck %s --check-prefix PTX			; RUN: llc < %s -mcpu=sm_20 \| FileCheck %s --check-prefix PTX
				; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	%class.outer = type <{ %class.inner, i32, [4 x i8] }>			%class.outer = type <{ %class.inner, i32, [4 x i8] }>
	%class.inner = type { i32, i32 }			%class.inner = type { i32, i32 }

	; Check that nvptx-lower-args preserves arg alignment			; Check that nvptx-lower-args preserves arg alignment
	Show All 24 Lines

llvm/test/CodeGen/NVPTX/lower-byval-args.ll

	; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefixes=CHECK,CHECK64			; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefixes=CHECK,CHECK64
	; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefixes=CHECK,CHECK32			; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefixes=CHECK,CHECK32
				; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	%struct.ham = type { [4 x i32] }			%struct.ham = type { [4 x i32] }

	; // Verify that load with static offset into parameter is done directly.			; // Verify that load with static offset into parameter is done directly.
	; CHECK-LABEL: .visible .entry static_offset			; CHECK-LABEL: .visible .entry static_offset
	; CHECK-NOT: .local			; CHECK-NOT: .local
	; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]			; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
	; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1			; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
	▲ Show 20 Lines • Show All 156 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	; Verify that both %input and %output are converted to global pointers and then			; Verify that both %input and %output are converted to global pointers and then
	; addrspacecast'ed back to the original type.			; addrspacecast'ed back to the original type.
	define void @kernel(float* %input, float* %output) {			define void @kernel(float* %input, float* %output) {
	; CHECK-LABEL: .visible .entry kernel(			; CHECK-LABEL: .visible .entry kernel(
	▲ Show 20 Lines • Show All 56 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/machine-sink.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

	@scalar1 = internal addrspace(3) global float 0.000000e+00, align 4			@scalar1 = internal addrspace(3) global float 0.000000e+00, align 4
	@scalar2 = internal addrspace(3) global float 0.000000e+00, align 4			@scalar2 = internal addrspace(3) global float 0.000000e+00, align 4

	; We shouldn't sink mul.rn.f32 to BB %merge because BB %merge post-dominates			; We shouldn't sink mul.rn.f32 to BB %merge because BB %merge post-dominates
	; BB %entry. Over-sinking created more register pressure on this example. The			; BB %entry. Over-sinking created more register pressure on this example. The
	Show All 31 Lines

llvm/test/CodeGen/NVPTX/managed.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx40 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx40 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx40 \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	; RUN: not --crash llc < %s -march=nvptx -mcpu=sm_20 2>&1 \| FileCheck %s --check-prefix ERROR			; RUN: not --crash llc < %s -march=nvptx -mcpu=sm_20 2>&1 \| FileCheck %s --check-prefix ERROR
	; ERROR: LLVM ERROR: .attribute(.managed) requires PTX version >= 4.0 and sm_30			; ERROR: LLVM ERROR: .attribute(.managed) requires PTX version >= 4.0 and sm_30

	; CHECK: .visible .global .align 4 .u32 device_g;			; CHECK: .visible .global .align 4 .u32 device_g;
	@device_g = addrspace(1) global i32 zeroinitializer			@device_g = addrspace(1) global i32 zeroinitializer
	; CHECK: .visible .global .attribute(.managed) .align 4 .u32 managed_g;			; CHECK: .visible .global .attribute(.managed) .align 4 .u32 managed_g;
	@managed_g = addrspace(1) global i32 zeroinitializer			@managed_g = addrspace(1) global i32 zeroinitializer
	Show All 9 Lines

llvm/test/CodeGen/NVPTX/match.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \| %ptxas-verify -arch=sm_70 %}

	declare i32 @llvm.nvvm.match.any.sync.i32(i32, i32)			declare i32 @llvm.nvvm.match.any.sync.i32(i32, i32)
	declare i32 @llvm.nvvm.match.any.sync.i64(i32, i64)			declare i32 @llvm.nvvm.match.any.sync.i64(i32, i64)

	; CHECK-LABEL: .func{{.*}}match_any_sync_i32			; CHECK-LABEL: .func{{.*}}match_any_sync_i32
	define i32 @match_any_sync_i32(i32 %mask, i32 %value) {			define i32 @match_any_sync_i32(i32 %mask, i32 %value) {
	; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [match_any_sync_i32_param_0];			; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [match_any_sync_i32_param_0];
	; CHECK: ld.param.u32 [[VALUE:%r[0-9]+]], [match_any_sync_i32_param_1];			; CHECK: ld.param.u32 [[VALUE:%r[0-9]+]], [match_any_sync_i32_param_1];
	▲ Show 20 Lines • Show All 108 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_53 -mattr=+ptx42 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_53 -mattr=+ptx42 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_53 -mattr=+ptx42 \| %ptxas-verify -arch=sm_53 %}

	declare half @llvm.nvvm.fma.rn.f16(half, half, half)			declare half @llvm.nvvm.fma.rn.f16(half, half, half)
	declare half @llvm.nvvm.fma.rn.ftz.f16(half, half, half)			declare half @llvm.nvvm.fma.rn.ftz.f16(half, half, half)
	declare half @llvm.nvvm.fma.rn.sat.f16(half, half, half)			declare half @llvm.nvvm.fma.rn.sat.f16(half, half, half)
	declare half @llvm.nvvm.fma.rn.ftz.sat.f16(half, half, half)			declare half @llvm.nvvm.fma.rn.ftz.sat.f16(half, half, half)
	declare <2 x half> @llvm.nvvm.fma.rn.f16x2(<2 x half>, <2 x half>, <2 x half>)			declare <2 x half> @llvm.nvvm.fma.rn.f16x2(<2 x half>, <2 x half>, <2 x half>)
	declare <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2(<2 x half>, <2 x half>, <2 x half>)			declare <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2(<2 x half>, <2 x half>, <2 x half>)
	declare <2 x half> @llvm.nvvm.fma.rn.sat.f16x2(<2 x half>, <2 x half>, <2 x half>)			declare <2 x half> @llvm.nvvm.fma.rn.sat.f16x2(<2 x half>, <2 x half>, <2 x half>)
	▲ Show 20 Lines • Show All 65 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| FileCheck %s
				; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| %ptxas-verify -arch=sm_80 %}

	declare i16 @llvm.nvvm.abs.bf16(i16)			declare i16 @llvm.nvvm.abs.bf16(i16)
	declare i32 @llvm.nvvm.abs.bf16x2(i32)			declare i32 @llvm.nvvm.abs.bf16x2(i32)
	declare i16 @llvm.nvvm.neg.bf16(i16)			declare i16 @llvm.nvvm.neg.bf16(i16)
	declare i32 @llvm.nvvm.neg.bf16x2(i32)			declare i32 @llvm.nvvm.neg.bf16x2(i32)

	declare float @llvm.nvvm.fmin.nan.f(float, float)			declare float @llvm.nvvm.fmin.nan.f(float, float)
	declare float @llvm.nvvm.fmin.ftz.nan.f(float, float)			declare float @llvm.nvvm.fmin.ftz.nan.f(float, float)
	▲ Show 20 Lines • Show All 356 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_86 -mattr=+ptx72 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_86 -mattr=+ptx72 \| FileCheck %s
				; RUN: %if ptxas-11.2 %{ llc < %s -march=nvptx64 -mcpu=sm_86 -mattr=+ptx72 \| %ptxas-verify -arch=sm_86 %}

	declare half @llvm.nvvm.fmin.xorsign.abs.f16(half, half)			declare half @llvm.nvvm.fmin.xorsign.abs.f16(half, half)
	declare half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half, half)			declare half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half, half)
	declare half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half, half)			declare half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half, half)
	declare half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16(half, half)			declare half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16(half, half)
	declare <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2(<2 x half> , <2 x half>)			declare <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2(<2 x half> , <2 x half>)
	declare <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2(<2 x half> , <2 x half>)			declare <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2(<2 x half> , <2 x half>)
	declare <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>)			declare <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>)
	▲ Show 20 Lines • Show All 282 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/math-intrins.ll

	; RUN: llc < %s \| FileCheck %s --check-prefixes=CHECK,CHECK-NOF16			; RUN: llc < %s \| FileCheck %s --check-prefixes=CHECK,CHECK-NOF16
	; RUN: llc < %s -mcpu=sm_80 \| FileCheck %s --check-prefixes=CHECK,CHECK-F16			; RUN: llc < %s -mcpu=sm_80 \| FileCheck %s --check-prefixes=CHECK,CHECK-F16
	; RUN: llc < %s -mcpu=sm_80 --nvptx-no-f16-math \| FileCheck %s --check-prefixes=CHECK,CHECK-NOF16			; RUN: llc < %s -mcpu=sm_80 --nvptx-no-f16-math \| FileCheck %s --check-prefixes=CHECK,CHECK-NOF16
				; RUN: %if ptxas %{ llc < %s \| %ptxas-verify %}
				; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 \| %ptxas-verify -arch=sm_80 %}
				; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math \| %ptxas-verify -arch=sm_80 %}

	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	; Checks that llvm intrinsics for math functions are correctly lowered to PTX.			; Checks that llvm intrinsics for math functions are correctly lowered to PTX.

	declare float @llvm.ceil.f32(float) #0			declare float @llvm.ceil.f32(float) #0
	declare double @llvm.ceil.f64(double) #0			declare double @llvm.ceil.f64(double) #0
	declare float @llvm.floor.f32(float) #0			declare float @llvm.floor.f32(float) #0
	declare double @llvm.floor.f64(double) #0			declare double @llvm.floor.f64(double) #0
	▲ Show 20 Lines • Show All 321 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/mbarrier.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_80 \| FileCheck %s -check-prefix=CHECK_PTX32			; RUN: llc < %s -march=nvptx -mcpu=sm_80 \| FileCheck %s -check-prefix=CHECK_PTX32
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 \| FileCheck %s -check-prefix=CHECK_PTX64			; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 \| FileCheck %s -check-prefix=CHECK_PTX64
				; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 \| %ptxas-verify -arch=sm_80 %}
				; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 \| %ptxas-verify -arch=sm_80 %}

	declare void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)			declare void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
	declare void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)			declare void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)

	; CHECK-LABEL: barrierinit			; CHECK-LABEL: barrierinit
	define void @barrierinit(i64* %a, i32 %b) {			define void @barrierinit(i64* %a, i32 %b) {
	; CHECK_PTX32: mbarrier.init.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};			; CHECK_PTX32: mbarrier.init.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
	; CHECK_PTX64: mbarrier.init.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};			; CHECK_PTX64: mbarrier.init.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
	▲ Show 20 Lines • Show All 135 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/minmax-negative.ll

	; RUN: llc < %s -march=nvptx -O0 \| FileCheck %s			; RUN: llc < %s -march=nvptx -O0 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -O0 \| %ptxas-verify %}

	define i16 @test1(i16* %sur1) {			define i16 @test1(i16* %sur1) {
	; CHECK-NOT: mov.u16 %rs{{[0-9]+}}, 32767			; CHECK-NOT: mov.u16 %rs{{[0-9]+}}, 32767
	%_tmp21.i = icmp sle i16 0, 0			%_tmp21.i = icmp sle i16 0, 0
	%_tmp22.i = select i1 %_tmp21.i, i16 0, i16 32767			%_tmp22.i = select i1 %_tmp21.i, i16 0, i16 32767
	store i16 %_tmp22.i, i16* %sur1			store i16 %_tmp22.i, i16* %sur1
	ret i16 0			ret i16 0
	}			}

llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	; CHECK-LABEL: t1			; CHECK-LABEL: t1
	define <4 x float> @t1(i8* %p1) {			define <4 x float> @t1(i8* %p1) {
	; CHECK-NOT: ld.v4			; CHECK-NOT: ld.v4
	; CHECK-NOT: ld.v2			; CHECK-NOT: ld.v2
	▲ Show 20 Lines • Show All 126 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/module-inline-asm.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

	; module asm must come after PTX version/target directives.			; module asm must come after PTX version/target directives.
	; CHECK-NOT: .global .b32 val;			; CHECK-NOT: .global .b32 val;

	; CHECK-DAG: .version			; CHECK-DAG: .version
	; CHECK-DAG: .target			; CHECK-DAG: .target
	Show All 11 Lines

llvm/test/CodeGen/NVPTX/mulwide.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O3 \| FileCheck %s --check-prefix=OPT			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O3 \| FileCheck %s --check-prefix=OPT
	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 \| FileCheck %s --check-prefix=NOOPT			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 \| FileCheck %s --check-prefix=NOOPT
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O3 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O0 \| %ptxas-verify %}

	; OPT-LABEL: @mulwide16			; OPT-LABEL: @mulwide16
	; NOOPT-LABEL: @mulwide16			; NOOPT-LABEL: @mulwide16
	define i32 @mulwide16(i16 %a, i16 %b) {			define i32 @mulwide16(i16 %a, i16 %b) {
	; OPT: mul.wide.s16			; OPT: mul.wide.s16
	; NOOPT: mul.lo.s32			; NOOPT: mul.lo.s32
	%val0 = sext i16 %a to i32			%val0 = sext i16 %a to i32
	%val1 = sext i16 %b to i32			%val1 = sext i16 %b to i32
	▲ Show 20 Lines • Show All 80 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/named-barriers.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; Use bar.sync to arrive at a pre-computed barrier number and			; Use bar.sync to arrive at a pre-computed barrier number and
	; wait for all threads in CTA to also arrive:			; wait for all threads in CTA to also arrive:
	define ptx_device void @test_barrier_named_cta() {			define ptx_device void @test_barrier_named_cta() {
	; CHECK: mov.u32 %r[[REG0:[0-9]+]], 0;			; CHECK: mov.u32 %r[[REG0:[0-9]+]], 0;
	; CHECK: bar.sync %r[[REG0]];			; CHECK: bar.sync %r[[REG0]];
	; CHECK: mov.u32 %r[[REG1:[0-9]+]], 10;			; CHECK: mov.u32 %r[[REG1:[0-9]+]], 10;
	; CHECK: bar.sync %r[[REG1]];			; CHECK: bar.sync %r[[REG1]];
	Show All 30 Lines

llvm/test/CodeGen/NVPTX/no-extra-parens.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; ptxas has no special meaning for '$' character, so it should be used			; ptxas has no special meaning for '$' character, so it should be used
	; without parens.			; without parens.

	@"$str" = private addrspace(1) constant [4 x i8] c"str\00"			@"$str" = private addrspace(1) constant [4 x i8] c"str\00"

	declare void @str2(i8* %str)			declare void @str2(i8* %str)
	define void @str1() {			define void @str1() {
	entry:			entry:
	;; CHECK: mov.u64 %rd{{[0-9]+}}, $str;			;; CHECK: mov.u64 %rd{{[0-9]+}}, $str;
	tail call void @str2(i8* getelementptr ([4 x i8], [4 x i8]* addrspacecast ([4 x i8] addrspace(1)* @"$str" to [4 x i8]*), i64 0, i64 0))			tail call void @str2(i8* getelementptr ([4 x i8], [4 x i8]* addrspacecast ([4 x i8] addrspace(1)* @"$str" to [4 x i8]*), i64 0, i64 0))
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/nofunc.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; Test that we don't crash if we're compiling a module with function references,			; Test that we don't crash if we're compiling a module with function references,
	; but without any functions in it.			; but without any functions in it.

	target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	@Funcs = local_unnamed_addr addrspace(1) externally_initialized			@Funcs = local_unnamed_addr addrspace(1) externally_initialized
	global [1 x void (i8*)*] [void (i8*)* @func], align 8			global [1 x void (i8*)*] [void (i8*)* @func], align 8

	declare void @func(i8*)			declare void @func(i8*)

	; CHECK: Funcs[1] = {func}			; CHECK: Funcs[1] = {func}

llvm/test/CodeGen/NVPTX/nounroll.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-unknown-unknown"			target triple = "nvptx64-unknown-unknown"

	; Compiled from the following CUDA code:			; Compiled from the following CUDA code:
	;			;
	; #pragma nounroll			; #pragma nounroll
	; for (int i = 0; i < 2; ++i)			; for (int i = 0; i < 2; ++i)
	Show All 28 Lines

llvm/test/CodeGen/NVPTX/nvcl-param-align.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target triple = "nvptx-unknown-nvcl"			target triple = "nvptx-unknown-nvcl"

	define void @foo(i64 %img, i64 %sampler, <5 x float>* align 32 %v1, i32* %v2) {			define void @foo(i64 %img, i64 %sampler, <5 x float>* align 32 %v1, i32* %v2) {
	; The parameter alignment is determined by the align attribute (default 1).			; The parameter alignment is determined by the align attribute (default 1).
	; CHECK-LABEL: .entry foo(			; CHECK-LABEL: .entry foo(
	; CHECK: .param .u32 .ptr .align 32 foo_param_2			; CHECK: .param .u32 .ptr .align 32 foo_param_2
	; CHECK: .param .u32 .ptr .align 1 foo_param_3			; CHECK: .param .u32 .ptr .align 1 foo_param_3
	ret void			ret void
	}			}

	!nvvm.annotations = !{!1, !2, !3}			!nvvm.annotations = !{!1, !2, !3}
	!1 = !{void (i64, i64, <5 x float>*, i32*)* @foo, !"kernel", i32 1}			!1 = !{void (i64, i64, <5 x float>*, i32*)* @foo, !"kernel", i32 1}
	!2 = !{void (i64, i64, <5 x float>*, i32*)* @foo, !"rdoimage", i32 0}			!2 = !{void (i64, i64, <5 x float>*, i32*)* @foo, !"rdoimage", i32 0}
	!3 = !{void (i64, i64, <5 x float>*, i32*)* @foo, !"sampler", i32 1}			!3 = !{void (i64, i64, <5 x float>*, i32*)* @foo, !"sampler", i32 1}

llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll

	; RUN: llc < %s -mtriple=nvptx-unknown-unknown \| FileCheck %s			; RUN: llc < %s -mtriple=nvptx-unknown-unknown \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown \| %ptxas-verify %}
	;			;
	; NVPTXTargetLowering::getFunctionParamOptimizedAlign, which was introduced in			; NVPTXTargetLowering::getFunctionParamOptimizedAlign, which was introduced in
	; D120129, contained a poorly designed assertion checking that a function with			; D120129, contained a poorly designed assertion checking that a function with
	; internal or private linkage is not a kernel. It relied on invariants that			; internal or private linkage is not a kernel. It relied on invariants that
	; were not actually guaranteed, and that resulted in compiler crash with some			; were not actually guaranteed, and that resulted in compiler crash with some
	; CUDA versions (see discussion with @jdoerfert in D120129). This test contains			; CUDA versions (see discussion with @jdoerfert in D120129). This test contains
	; metadata that caused compiler crash and a function with internal linkage			; metadata that caused compiler crash and a function with internal linkage
	; whose purpose is to let compiler run on path where the crash happened.			; whose purpose is to let compiler run on path where the crash happened.
	Show All 25 Lines

llvm/test/CodeGen/NVPTX/param-align.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	;;; Need 4-byte alignment on float* passed byval			;;; Need 4-byte alignment on float* passed byval
	define ptx_device void @t1(float* byval(float) %x) {			define ptx_device void @t1(float* byval(float) %x) {
	; CHECK: .func t1			; CHECK: .func t1
	; CHECK: .param .align 4 .b8 t1_param_0[4]			; CHECK: .param .align 4 .b8 t1_param_0[4]
	ret void			ret void
	}			}

	Show All 35 Lines

llvm/test/CodeGen/NVPTX/param-load-store.ll

	; Verifies correctness of load/store of parameters and return values.			; Verifies correctness of load/store of parameters and return values.
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs \| FileCheck -allow-deprecated-dag-overlap %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs \| FileCheck -allow-deprecated-dag-overlap %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs \| %ptxas-verify -arch=sm_35 %}

	%s_i1 = type { i1 }			%s_i1 = type { i1 }
	%s_i8 = type { i8 }			%s_i8 = type { i8 }
	%s_i16 = type { i16 }			%s_i16 = type { i16 }
	%s_f16 = type { half }			%s_f16 = type { half }
	%s_i32 = type { i32 }			%s_i32 = type { i32 }
	%s_f32 = type { float }			%s_f32 = type { float }
	%s_i64 = type { i64 }			%s_i64 = type { i64 }
	▲ Show 20 Lines • Show All 938 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/param-vectorize-device.ll

	; RUN: llc < %s -mtriple=nvptx-unknown-unknown \| FileCheck %s			; RUN: llc < %s -mtriple=nvptx-unknown-unknown \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown \| %ptxas-verify %}
	;			;
	; Check that parameters of a __device__ function with private or internal			; Check that parameters of a __device__ function with private or internal
	; linkage called from a __global__ (kernel) function get increased alignment,			; linkage called from a __global__ (kernel) function get increased alignment,
	; and additional vectorization is performed on loads/stores with those			; and additional vectorization is performed on loads/stores with those
	; parameters.			; parameters.
	;			;
	; Test IR is a minimized version of IR generated with the following command			; Test IR is a minimized version of IR generated with the following command
	; from the source code below:			; from the source code below:
	▲ Show 20 Lines • Show All 746 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll

	; RUN: llc < %s -mtriple=nvptx-unknown-unknown \| FileCheck %s			; RUN: llc < %s -mtriple=nvptx-unknown-unknown \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown \| %ptxas-verify %}
	;			;
	; Check that parameters of a __global__ (kernel) function do not get increased			; Check that parameters of a __global__ (kernel) function do not get increased
	; alignment, and no additional vectorization is performed on loads/stores with			; alignment, and no additional vectorization is performed on loads/stores with
	; those parameters.			; those parameters.
	;			;
	; Test IR is a minimized version of IR generated with the following command			; Test IR is a minimized version of IR generated with the following command
	; from the source code below:			; from the source code below:
	; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu			; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu
	▲ Show 20 Lines • Show All 423 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll

	; RUN: llc -march=nvptx -verify-machineinstrs < %s \| FileCheck %s			; RUN: llc -march=nvptx -verify-machineinstrs < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc -march=nvptx -verify-machineinstrs < %s \| %ptxas-verify %}

	; Tests the following pattern:			; Tests the following pattern:
	; (X & 8) != 0 --> (X & 8) >> 3			; (X & 8) != 0 --> (X & 8) >> 3

	; This produces incorrect code in general when boolean false is			; This produces incorrect code in general when boolean false is
	; represented as a negative one. There is however a special			; represented as a negative one. There is however a special
	; case when the type has a bitsize of 1, for which the false			; case when the type has a bitsize of 1, for which the false
	; value will be identical regardless of the boolean representation.			; value will be identical regardless of the boolean representation.
	Show All 10 Lines

llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX32			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX32
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX64			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX64
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	define ptx_kernel void @t1(i1* %a) {			define ptx_kernel void @t1(i1* %a) {
	; PTX32: mov.u16 %rs{{[0-9]+}}, 0;			; PTX32: mov.u16 %rs{{[0-9]+}}, 0;
	; PTX32-NEXT: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}};			; PTX32-NEXT: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}};
	; PTX64: mov.u16 %rs{{[0-9]+}}, 0;			; PTX64: mov.u16 %rs{{[0-9]+}}, 0;
	; PTX64-NEXT: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};			; PTX64-NEXT: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};
	store i1 false, i1* %a			store i1 false, i1* %a
	ret void			ret void
	Show All 16 Lines

llvm/test/CodeGen/NVPTX/pr16278.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	@one_f = addrspace(4) global float 1.000000e+00, align 4			@one_f = addrspace(4) global float 1.000000e+00, align 4

	define float @foo() {			define float @foo() {
	; CHECK: ld.const.f32			; CHECK: ld.const.f32
	%val = load float, float addrspace(4)* @one_f			%val = load float, float addrspace(4)* @one_f
	ret float %val			ret float %val
	}			}

llvm/test/CodeGen/NVPTX/pr17529.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	; Function Attrs: nounwind			; Function Attrs: nounwind
	; CHECK: .func kernelgen_memcpy			; CHECK: .func kernelgen_memcpy
	define ptx_device void @kernelgen_memcpy(i8* nocapture %dst) #0 {			define ptx_device void @kernelgen_memcpy(i8* nocapture %dst) #0 {
	entry:			entry:
	Show All 29 Lines

llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	; Check load from constant global variables. These loads should be			; Check load from constant global variables. These loads should be
	; ld.global.nc (aka ldg).			; ld.global.nc (aka ldg).

	@gv_float = external constant float			@gv_float = external constant float
	@gv_float2 = external constant <2 x float>			@gv_float2 = external constant <2 x float>
	@gv_float4 = external constant <4 x float>			@gv_float4 = external constant <4 x float>

	Show All 20 Lines

llvm/test/CodeGen/NVPTX/redux-sync.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| FileCheck %s
				; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 \| %ptxas-verify -arch=sm_80 %}

	declare i32 @llvm.nvvm.redux.sync.umin(i32, i32)			declare i32 @llvm.nvvm.redux.sync.umin(i32, i32)
	; CHECK-LABEL: .func{{.*}}redux_sync_min_u32			; CHECK-LABEL: .func{{.*}}redux_sync_min_u32
	define i32 @redux_sync_min_u32(i32 %src, i32 %mask) {			define i32 @redux_sync_min_u32(i32 %src, i32 %mask) {
	; CHECK: redux.sync.min.u32			; CHECK: redux.sync.min.u32
	%val = call i32 @llvm.nvvm.redux.sync.umin(i32 %src, i32 %mask)			%val = call i32 @llvm.nvvm.redux.sync.umin(i32 %src, i32 %mask)
	ret i32 %val			ret i32 %val
	}			}
	▲ Show 20 Lines • Show All 56 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/refl1.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target triple = "nvptx-nvidia-cuda"			target triple = "nvptx-nvidia-cuda"

	; Function Attrs: nounwind			; Function Attrs: nounwind
	; CHECK: .entry foo			; CHECK: .entry foo
	define void @foo(float* nocapture %a) #0 {			define void @foo(float* nocapture %a) #0 {
	%val = load float, float* %a			%val = load float, float* %a
	%tan = tail call fastcc float @__nv_fast_tanf(float %val)			%tan = tail call fastcc float @__nv_fast_tanf(float %val)
	Show All 30 Lines

llvm/test/CodeGen/NVPTX/reg-copy.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}

	target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-unknown-unknown"			target triple = "nvptx64-unknown-unknown"

	define void @PR24303(float* %f) {			define void @PR24303(float* %f) {
	; CHECK-LABEL: .visible .entry PR24303(			; CHECK-LABEL: .visible .entry PR24303(
	; Do not use mov.f or mov.u to convert between float and int.			; Do not use mov.f or mov.u to convert between float and int.
	; CHECK-NOT: mov.{{f|u}}{{32|64}} %f{{[0-9]+}}, %r{{[0-9]+}}			; CHECK-NOT: mov.{{f|u}}{{32|64}} %f{{[0-9]+}}, %r{{[0-9]+}}
	▲ Show 20 Lines • Show All 215 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/reg-types.ll

	; Verify register types we generate in PTX.			; Verify register types we generate in PTX.
	; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
	; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s -check-prefixes=NO8BIT			; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s -check-prefixes=NO8BIT
	; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s -check-prefixes=NO8BIT			; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s -check-prefixes=NO8BIT
				; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK-LABEL: .visible .func func()			; CHECK-LABEL: .visible .func func()
	; NO8BIT-LABEL: .visible .func func()			; NO8BIT-LABEL: .visible .func func()
	define void @func() {			define void @func() {
	entry:			entry:
	%s8 = alloca i8, align 1			%s8 = alloca i8, align 1
	%u8 = alloca i8, align 1			%u8 = alloca i8, align 1
	%s16 = alloca i16, align 2			%s16 = alloca i16, align 2
	▲ Show 20 Lines • Show All 56 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/rotate.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck --check-prefix=SM20 %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck --check-prefix=SM20 %s
	; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck --check-prefix=SM35 %s			; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck --check-prefix=SM35 %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 \| %ptxas-verify -arch=sm_35 %}


	declare i32 @llvm.nvvm.rotate.b32(i32, i32)			declare i32 @llvm.nvvm.rotate.b32(i32, i32)
	declare i64 @llvm.nvvm.rotate.b64(i64, i32)			declare i64 @llvm.nvvm.rotate.b64(i64, i32)
	declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)			declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)

	; SM20: rotate32			; SM20: rotate32
	; SM35: rotate32			; SM35: rotate32
	▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/rotate_64.ll

	; RUN: llc < %s -march=nvptx \| FileCheck %s			; RUN: llc < %s -march=nvptx \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx \| %ptxas-verify %}

	declare i64 @llvm.nvvm.rotate.b64(i64, i32)			declare i64 @llvm.nvvm.rotate.b64(i64, i32)
	declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)			declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)

	; CHECK: rotate64			; CHECK: rotate64
	define i64 @rotate64(i64 %a, i32 %b) {			define i64 @rotate64(i64 %a, i32 %b) {
	; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 3;			; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 3;
	; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 61;			; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 61;
	Show All 15 Lines

llvm/test/CodeGen/NVPTX/sched1.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; Ensure source scheduling is working			; Ensure source scheduling is working

	define void @foo(i32* %a) {			define void @foo(i32* %a) {
	; CHECK: .func foo			; CHECK: .func foo
	; CHECK: ld.u32			; CHECK: ld.u32
	; CHECK-NEXT: ld.u32			; CHECK-NEXT: ld.u32
	; CHECK-NEXT: ld.u32			; CHECK-NEXT: ld.u32
	Show All 22 Lines

llvm/test/CodeGen/NVPTX/sched2.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	define void @foo(<2 x i32>* %a) {			define void @foo(<2 x i32>* %a) {
	; CHECK: .func foo			; CHECK: .func foo
	; CHECK: ld.v2.u32			; CHECK: ld.v2.u32
	; CHECK-NEXT: ld.v2.u32			; CHECK-NEXT: ld.v2.u32
	; CHECK-NEXT: ld.v2.u32			; CHECK-NEXT: ld.v2.u32
	; CHECK-NEXT: ld.v2.u32			; CHECK-NEXT: ld.v2.u32
	; CHECK-NEXT: add.s32			; CHECK-NEXT: add.s32
	Show All 23 Lines

llvm/test/CodeGen/NVPTX/sext-in-reg.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"


	define void @one(i64 %a, i64 %b, i64* %p1, i64* %p2) {			define void @one(i64 %a, i64 %b, i64* %p1, i64* %p2) {
	; CHECK: cvt.s64.s8			; CHECK: cvt.s64.s8
	; CHECK: cvt.s64.s8			; CHECK: cvt.s64.s8
	entry:			entry:
	▲ Show 20 Lines • Show All 102 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/sext-params.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"


	define i8 @foo(i8 signext %a) {			define i8 @foo(i8 signext %a) {
	; CHECK: ld.param.s8			; CHECK: ld.param.s8
	%ret = add i8 %a, 3			%ret = add i8 %a, 3
	ret i8 %ret			ret i8 %ret
	}			}

	define i8 @bar(i8 zeroext %a) {			define i8 @bar(i8 zeroext %a) {
	; CHECK: ld.param.u8			; CHECK: ld.param.u8
	%ret = add i8 %a, 3			%ret = add i8 %a, 3
	ret i8 %ret			ret i8 %ret
	}			}

llvm/test/CodeGen/NVPTX/shfl-p.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	declare {i32, i1} @llvm.nvvm.shfl.down.i32p(i32, i32, i32)			declare {i32, i1} @llvm.nvvm.shfl.down.i32p(i32, i32, i32)
	declare {float, i1} @llvm.nvvm.shfl.down.f32p(float, i32, i32)			declare {float, i1} @llvm.nvvm.shfl.down.f32p(float, i32, i32)
	declare {i32, i1} @llvm.nvvm.shfl.up.i32p(i32, i32, i32)			declare {i32, i1} @llvm.nvvm.shfl.up.i32p(i32, i32, i32)
	declare {float, i1} @llvm.nvvm.shfl.up.f32p(float, i32, i32)			declare {float, i1} @llvm.nvvm.shfl.up.f32p(float, i32, i32)
	declare {i32, i1} @llvm.nvvm.shfl.bfly.i32p(i32, i32, i32)			declare {i32, i1} @llvm.nvvm.shfl.bfly.i32p(i32, i32, i32)
	declare {float, i1} @llvm.nvvm.shfl.bfly.f32p(float, i32, i32)			declare {float, i1} @llvm.nvvm.shfl.bfly.f32p(float, i32, i32)
	declare {i32, i1} @llvm.nvvm.shfl.idx.i32p(i32, i32, i32)			declare {i32, i1} @llvm.nvvm.shfl.idx.i32p(i32, i32, i32)
	▲ Show 20 Lines • Show All 163 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/shfl-sync-p.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	declare {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32)			declare {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32)
	declare {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32)			declare {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32)
	declare {i32, i1} @llvm.nvvm.shfl.sync.up.i32p(i32, i32, i32, i32)			declare {i32, i1} @llvm.nvvm.shfl.sync.up.i32p(i32, i32, i32, i32)
	declare {float, i1} @llvm.nvvm.shfl.sync.up.f32p(i32, float, i32, i32)			declare {float, i1} @llvm.nvvm.shfl.sync.up.f32p(i32, float, i32, i32)
	declare {i32, i1} @llvm.nvvm.shfl.sync.bfly.i32p(i32, i32, i32, i32)			declare {i32, i1} @llvm.nvvm.shfl.sync.bfly.i32p(i32, i32, i32, i32)
	declare {float, i1} @llvm.nvvm.shfl.sync.bfly.f32p(i32, float, i32, i32)			declare {float, i1} @llvm.nvvm.shfl.sync.bfly.f32p(i32, float, i32, i32)
	declare {i32, i1} @llvm.nvvm.shfl.sync.idx.i32p(i32, i32, i32, i32)			declare {i32, i1} @llvm.nvvm.shfl.sync.idx.i32p(i32, i32, i32, i32)
	▲ Show 20 Lines • Show All 171 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/shfl-sync.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32)			declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32)
	declare float @llvm.nvvm.shfl.sync.down.f32(float, i32, i32, i32)			declare float @llvm.nvvm.shfl.sync.down.f32(float, i32, i32, i32)
	declare i32 @llvm.nvvm.shfl.sync.up.i32(i32, i32, i32, i32)			declare i32 @llvm.nvvm.shfl.sync.up.i32(i32, i32, i32, i32)
	declare float @llvm.nvvm.shfl.sync.up.f32(float, i32, i32, i32)			declare float @llvm.nvvm.shfl.sync.up.f32(float, i32, i32, i32)
	declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32)			declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32)
	declare float @llvm.nvvm.shfl.sync.bfly.f32(float, i32, i32, i32)			declare float @llvm.nvvm.shfl.sync.bfly.f32(float, i32, i32, i32)
	declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32)			declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32)
	▲ Show 20 Lines • Show All 85 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/shfl.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)			declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
	declare float @llvm.nvvm.shfl.down.f32(float, i32, i32)			declare float @llvm.nvvm.shfl.down.f32(float, i32, i32)
	declare i32 @llvm.nvvm.shfl.up.i32(i32, i32, i32)			declare i32 @llvm.nvvm.shfl.up.i32(i32, i32, i32)
	declare float @llvm.nvvm.shfl.up.f32(float, i32, i32)			declare float @llvm.nvvm.shfl.up.f32(float, i32, i32)
	declare i32 @llvm.nvvm.shfl.bfly.i32(i32, i32, i32)			declare i32 @llvm.nvvm.shfl.bfly.i32(i32, i32, i32)
	declare float @llvm.nvvm.shfl.bfly.f32(float, i32, i32)			declare float @llvm.nvvm.shfl.bfly.f32(float, i32, i32)
	declare i32 @llvm.nvvm.shfl.idx.i32(i32, i32, i32)			declare i32 @llvm.nvvm.shfl.idx.i32(i32, i32, i32)
	▲ Show 20 Lines • Show All 81 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/shift-parts.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK: shift_parts_left_128			; CHECK: shift_parts_left_128
	define void @shift_parts_left_128(i128* %val, i128* %amtptr) {			define void @shift_parts_left_128(i128* %val, i128* %amtptr) {
	; CHECK: shl.b64			; CHECK: shl.b64
	; CHECK: mov.u32			; CHECK: mov.u32
	; CHECK: sub.s32			; CHECK: sub.s32
	; CHECK: shr.u64			; CHECK: shr.u64
	; CHECK: or.b64			; CHECK: or.b64
	Show All 29 Lines

llvm/test/CodeGen/NVPTX/simple-call.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	; CHECK: .func ({{.*}}) device_func			; CHECK: .func ({{.*}}) device_func
	define float @device_func(float %a) noinline {			define float @device_func(float %a) noinline {
	%ret = fmul float %a, %a			%ret = fmul float %a, %a
	ret float %ret			ret float %ret
	}			}

	; CHECK: .entry kernel_func			; CHECK: .entry kernel_func
	Show All 14 Lines

llvm/test/CodeGen/NVPTX/sqrt-approx.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
	; RUN: \| FileCheck %s			; RUN: \| FileCheck %s
				; RUN: %if ptxas %{ \
				; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
				; RUN: \| %ptxas-verify \
				; RUN: %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

	declare float @llvm.sqrt.f32(float)			declare float @llvm.sqrt.f32(float)
	declare double @llvm.sqrt.f64(double)			declare double @llvm.sqrt.f64(double)

	; -- reciprocal sqrt --			; -- reciprocal sqrt --

	▲ Show 20 Lines • Show All 196 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/st-addrspace.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefixes=ALL,G32,LS32			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefixes=ALL,G32,LS32
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefixes=ALL,G64,LS64			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefixes=ALL,G64,LS64
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr \| FileCheck %s --check-prefixes=G64,LS32			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr \| FileCheck %s --check-prefixes=G64,LS32
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr \| %ptxas-verify %}

	;; i8			;; i8
	; ALL-LABEL: st_global_i8			; ALL-LABEL: st_global_i8
	define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) {			define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) {
	; G32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}			; G32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
	; G64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}			; G64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
	; ALL: ret			; ALL: ret
	store i8 %a, i8 addrspace(1)* %ptr			store i8 %a, i8 addrspace(1)* %ptr
	▲ Show 20 Lines • Show All 148 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/st-generic.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX32			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX32
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX64			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s --check-prefix=PTX64
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	;; i8			;; i8

	define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) {			define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) {
	; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}			; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
	; PTX32: ret			; PTX32: ret
	; PTX64: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}			; PTX64: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
	; PTX64: ret			; PTX64: ret
	▲ Show 20 Lines • Show All 58 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/store-retval.ll

	; RUN: llc < %s --mtriple=nvptx-unknown-unknown \| FileCheck %s			; RUN: llc < %s --mtriple=nvptx-unknown-unknown \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s --mtriple=nvptx-unknown-unknown \| %ptxas-verify %}
	;			;
	; This is IR generated with clang using -O3 optimization level			; This is IR generated with clang using -O3 optimization level
	; and nvptx-unknown-unknown target from the following C code.			; and nvptx-unknown-unknown target from the following C code.
	;			;
	; struct StNoalign { unsigned int field[5]; };			; struct StNoalign { unsigned int field[5]; };
	; struct StAlign8 { _Alignas(8) unsigned int field[5]; };			; struct StAlign8 { _Alignas(8) unsigned int field[5]; };
	; struct StAlign16 { _Alignas(16) unsigned int field[5]; };			; struct StAlign16 { _Alignas(16) unsigned int field[5]; };
	;			;
	▲ Show 20 Lines • Show All 65 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/surf-read-cuda.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=SM20			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=SM20
	; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| FileCheck %s --check-prefix=SM30			; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| FileCheck %s --check-prefix=SM30
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	target triple = "nvptx-unknown-cuda"			target triple = "nvptx-unknown-cuda"

	declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)			declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
	declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)			declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)


	; SM20-LABEL: .entry foo			; SM20-LABEL: .entry foo
	▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/surf-read.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	target triple = "nvptx-unknown-nvcl"			target triple = "nvptx-unknown-nvcl"

	declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)			declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)

	; CHECK: .entry foo			; CHECK: .entry foo
	define void @foo(i64 %img, float* %red, i32 %idx) {			define void @foo(i64 %img, float* %red, i32 %idx) {
	; CHECK: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [foo_param_0, {%r{{[0-9]+}}}]			; CHECK: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [foo_param_0, {%r{{[0-9]+}}}]
	Show All 11 Lines

llvm/test/CodeGen/NVPTX/surf-tex.py

	# RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll			# RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll
	# RUN: llc %t-cuda.ll -verify-machineinstrs -o - \| FileCheck %t-cuda.ll			# RUN: llc %t-cuda.ll -verify-machineinstrs -o - \| FileCheck %t-cuda.ll
				# RUN: %if ptxas %{ llc %t-cuda.ll -verify-machineinstrs -o - \| %ptxas-verify %}

	# We only need to run this a second time for texture tests, because			# We only need to run this a second time for texture tests, because
	# there is a difference between unified and non-unified intrinsics.			# there is a difference between unified and non-unified intrinsics.
	#			#
	# RUN: %python %s --target=nvcl --tests=suld,sust,tex,tld4 --gen-list-append --gen-list=%t.list > %t-nvcl.ll			# RUN: %python %s --target=nvcl --tests=suld,sust,tex,tld4 --gen-list-append --gen-list=%t.list > %t-nvcl.ll
	# RUN: llc %t-nvcl.ll -verify-machineinstrs -o - \| FileCheck %t-nvcl.ll			# RUN: llc %t-nvcl.ll -verify-machineinstrs -o - \| FileCheck %t-nvcl.ll
				# RUN: %if ptxas %{ llc %t-nvcl.ll -verify-machineinstrs -o - \| %ptxas-verify %}

	# Verify that all instructions and intrinsics defined in TableGen			# Verify that all instructions and intrinsics defined in TableGen
	# files are tested. The command may fail if the files are changed			# files are tested. The command may fail if the files are changed
	# significantly and we can no longer find names of intrinsics or			# significantly and we can no longer find names of intrinsics or
	# instructions. In that case we can replace this command with a			# instructions. In that case we can replace this command with a
	# reference list.			# reference list.
	#			#
	# Verification is turned off by default to avoid issues when the LLVM			# Verification is turned off by default to avoid issues when the LLVM
	▲ Show 20 Lines • Show All 1,009 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/surf-write-cuda.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=SM20			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=SM20
	; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| FileCheck %s --check-prefix=SM30			; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| FileCheck %s --check-prefix=SM30
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	target triple = "nvptx-unknown-cuda"			target triple = "nvptx-unknown-cuda"

	declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)			declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
	declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)			declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)


	; SM20-LABEL: .entry foo			; SM20-LABEL: .entry foo
	Show All 32 Lines

llvm/test/CodeGen/NVPTX/surf-write.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	target triple = "nvptx-unknown-nvcl"			target triple = "nvptx-unknown-nvcl"

	declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)			declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)

	; CHECK: .entry foo			; CHECK: .entry foo
	define void @foo(i64 %img, i32 %val, i32 %idx) {			define void @foo(i64 %img, i32 %val, i32 %idx) {
	; CHECK: sust.b.1d.b32.trap [foo_param_0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}}			; CHECK: sust.b.1d.b32.trap [foo_param_0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
	tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val)			tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val)
	ret void			ret void
	}			}

	!nvvm.annotations = !{!1, !2}			!nvvm.annotations = !{!1, !2}
	!1 = !{void (i64, i32, i32)* @foo, !"kernel", i32 1}			!1 = !{void (i64, i32, i32)* @foo, !"kernel", i32 1}
	!2 = !{void (i64, i32, i32)* @foo, !"wroimage", i32 0}			!2 = !{void (i64, i32, i32)* @foo, !"wroimage", i32 0}

llvm/test/CodeGen/NVPTX/symbol-naming.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	; Verify that the NVPTX target removes invalid symbol names prior to emitting			; Verify that the NVPTX target removes invalid symbol names prior to emitting
	; PTX.			; PTX.

	; CHECK-NOT: .str			; CHECK-NOT: .str
	; CHECK-NOT: .function.			; CHECK-NOT: .function.

	; CHECK-DAG: _$_str			; CHECK-DAG: _$_str
	Show All 36 Lines

llvm/test/CodeGen/NVPTX/tex-read-cuda.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=SM20			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=SM20
	; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| FileCheck %s --check-prefix=SM30			; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| FileCheck %s --check-prefix=SM30
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}


	target triple = "nvptx-unknown-cuda"			target triple = "nvptx-unknown-cuda"

	declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32)			declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32)
	declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)			declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)

	; SM20-LABEL: .entry foo			; SM20-LABEL: .entry foo
	▲ Show 20 Lines • Show All 64 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/tex-read.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	target triple = "nvptx-unknown-nvcl"			target triple = "nvptx-unknown-nvcl"

	declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32)			declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32)

	; CHECK: .entry foo			; CHECK: .entry foo
	define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) {			define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) {
	; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]			; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
	Show All 11 Lines

llvm/test/CodeGen/NVPTX/texsurf-queries.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=SM20			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s --check-prefix=SM20
	; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| FileCheck %s --check-prefix=SM30			; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| FileCheck %s --check-prefix=SM30
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	target triple = "nvptx-unknown-cuda"			target triple = "nvptx-unknown-cuda"

	@tex0 = internal addrspace(1) global i64 0, align 8			@tex0 = internal addrspace(1) global i64 0, align 8
	@surf0 = internal addrspace(1) global i64 0, align 8			@surf0 = internal addrspace(1) global i64 0, align 8

	declare i32 @llvm.nvvm.txq.width(i64)			declare i32 @llvm.nvvm.txq.width(i64)
	declare i32 @llvm.nvvm.txq.height(i64)			declare i32 @llvm.nvvm.txq.height(i64)
	▲ Show 20 Lines • Show All 93 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/tid-range.ll

	; RUN: llc < %s -march=nvptx64 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 \| %ptxas-verify %}

	declare i32 @get_register()			declare i32 @get_register()

	define i1 @test1() {			define i1 @test1() {
	entry:			entry:
	%call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !0			%call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !0
	%cmp = icmp eq i32 %call, 1			%cmp = icmp eq i32 %call, 1
	ret i1 %cmp			ret i1 %cmp
	}			}
	Show All 9 Lines

llvm/test/CodeGen/NVPTX/tuple-literal.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20			; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ \| %ptxas-verify %}

	define ptx_device void @test_function({i8, i8}*) {			define ptx_device void @test_function({i8, i8}*) {
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/vec-param-load.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

	define <16 x float> @test_v16f32(<16 x float> %a) {			define <16 x float> @test_v16f32(<16 x float> %a) {
	; CHECK-LABEL: test_v16f32(			; CHECK-LABEL: test_v16f32(
	; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];			; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
	; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];			; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
	; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];			; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
	▲ Show 20 Lines • Show All 73 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/vec8.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	target triple = "nvptx-unknown-cuda"			target triple = "nvptx-unknown-cuda"

	; CHECK: .visible .func foo			; CHECK: .visible .func foo
	define void @foo(<8 x i8> %a, i8* %b) {			define void @foo(<8 x i8> %a, i8* %b) {
	; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]			; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
	; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]			; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
	; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1]			; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1]
	Show All 9 Lines

llvm/test/CodeGen/NVPTX/vector-args.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	define float @foo(<2 x float> %a) {			define float @foo(<2 x float> %a) {
	; CHECK: .func (.param .b32 func_retval0) foo			; CHECK: .func (.param .b32 func_retval0) foo
	; CHECK: .param .align 8 .b8 foo_param_0[8]			; CHECK: .param .align 8 .b8 foo_param_0[8]
	; CHECK: ld.param.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}			; CHECK: ld.param.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
	%t1 = fmul <2 x float> %a, %a			%t1 = fmul <2 x float> %a, %a
	%t2 = extractelement <2 x float> %t1, i32 0			%t2 = extractelement <2 x float> %t1, i32 0
	%t3 = extractelement <2 x float> %t1, i32 1			%t3 = extractelement <2 x float> %t1, i32 1
	Show All 25 Lines

llvm/test/CodeGen/NVPTX/vector-call.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs \| %ptxas-verify %}

	target triple = "nvptx-unknown-cuda"			target triple = "nvptx-unknown-cuda"

	declare void @bar(<4 x i32>)			declare void @bar(<4 x i32>)

	; CHECK-LABEL: .func foo(			; CHECK-LABEL: .func foo(
	; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];			; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
	; CHECK: .param .align 16 .b8 param0[16];			; CHECK: .param .align 16 .b8 param0[16];
	Show All 21 Lines

llvm/test/CodeGen/NVPTX/vector-compare.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20			; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ \| %ptxas-verify %}
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 %if ptxas %{ \| %ptxas-verify %}

	; This test makes sure that the results of vector compares are properly			; This test makes sure that the results of vector compares are properly
	; scalarized. If codegen fails, then the type legalizer incorrectly			; scalarized. If codegen fails, then the type legalizer incorrectly
	; tried to promote <2 x i1> to <2 x i8> and instruction selection failed.			; tried to promote <2 x i1> to <2 x i8> and instruction selection failed.

	define void @foo(<2 x i32>* %a, <2 x i32>* %b, i32* %r1, i32* %r2) {			define void @foo(<2 x i32>* %a, <2 x i32>* %b, i32* %r1, i32* %r2) {
	%aval = load <2 x i32>, <2 x i32>* %a			%aval = load <2 x i32>, <2 x i32>* %a
	%bval = load <2 x i32>, <2 x i32>* %b			%bval = load <2 x i32>, <2 x i32>* %b
	Show All 9 Lines

llvm/test/CodeGen/NVPTX/vector-global.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 \| %ptxas-verify %}

	target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"			target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	@g1 = external global <4 x i32> ; external global variable			@g1 = external global <4 x i32> ; external global variable
	; CHECK: .extern .global .align 16 .b8 g1[16];			; CHECK: .extern .global .align 16 .b8 g1[16];
	@g2 = global <4 x i32> zeroinitializer ; module-level global variable			@g2 = global <4 x i32> zeroinitializer ; module-level global variable
	; CHECK: .visible .global .align 16 .b8 g2[16];			; CHECK: .visible .global .align 16 .b8 g2[16];

llvm/test/CodeGen/NVPTX/vector-loads.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; Even though general vector types are not supported in PTX, we can still			; Even though general vector types are not supported in PTX, we can still
	; optimize loads/stores with pseudo-vector instructions of the form:			; optimize loads/stores with pseudo-vector instructions of the form:
	;			;
	; ld.v2.f32 {%f0, %f1}, [%r0]			; ld.v2.f32 {%f0, %f1}, [%r0]
	;			;
	; which will load two floats at once into scalar registers.			; which will load two floats at once into scalar registers.

	▲ Show 20 Lines • Show All 92 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/vector-select.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20			; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ \| %ptxas-verify %}
	; RUN: llc < %s -march=nvptx64 -mcpu=sm_20			; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 %if ptxas %{ \| %ptxas-verify %}

	; This test makes sure that vector selects are scalarized by the type legalizer.			; This test makes sure that vector selects are scalarized by the type legalizer.
	; If not, type legalization will fail.			; If not, type legalization will fail.

	define void @foo(<2 x i32> addrspace(1)* %def_a, <2 x i32> addrspace(1)* %def_b, <2 x i32> addrspace(1)* %def_c) {			define void @foo(<2 x i32> addrspace(1)* %def_a, <2 x i32> addrspace(1)* %def_b, <2 x i32> addrspace(1)* %def_c) {
	entry:			entry:
	%tmp4 = load <2 x i32>, <2 x i32> addrspace(1)* %def_a			%tmp4 = load <2 x i32>, <2 x i32> addrspace(1)* %def_a
	%tmp6 = load <2 x i32>, <2 x i32> addrspace(1)* %def_c			%tmp6 = load <2 x i32>, <2 x i32> addrspace(1)* %def_c
	%tmp8 = load <2 x i32>, <2 x i32> addrspace(1)* %def_b			%tmp8 = load <2 x i32>, <2 x i32> addrspace(1)* %def_b
	%0 = icmp sge <2 x i32> %tmp4, zeroinitializer			%0 = icmp sge <2 x i32> %tmp4, zeroinitializer
	%cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8			%cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8
	store <2 x i32> %cond, <2 x i32> addrspace(1)* %def_c			store <2 x i32> %cond, <2 x i32> addrspace(1)* %def_c
	ret void			ret void
	}			}

llvm/test/CodeGen/NVPTX/vector-stores.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK: .visible .func foo1			; CHECK: .visible .func foo1
	; CHECK: st.v2.f32			; CHECK: st.v2.f32
	define void @foo1(<2 x float> %val, <2 x float>* %ptr) {			define void @foo1(<2 x float> %val, <2 x float>* %ptr) {
	store <2 x float> %val, <2 x float>* %ptr			store <2 x float> %val, <2 x float>* %ptr
	ret void			ret void
	}			}

	Show All 21 Lines

llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll

	; RUN: llc < %s \| FileCheck %s			; RUN: llc < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s \| %ptxas-verify %}

	target triple = "nvptx64-nvidia-cuda"			target triple = "nvptx64-nvidia-cuda"

	; CHECK-LABEL: test1			; CHECK-LABEL: test1
	; CHECK: ld.global.v2.f32			; CHECK: ld.global.v2.f32
	; CHECK: ld.global.v2.f32			; CHECK: ld.global.v2.f32
	; CHECK: st.global.v2.f32			; CHECK: st.global.v2.f32
	; CHECK: st.global.v2.f32			; CHECK: st.global.v2.f32
	define void @test1(float addrspace(1)* noalias align 8 %in, float addrspace(1)* noalias align 8 %out) {			define void @test1(float addrspace(1)* noalias align 8 %in, float addrspace(1)* noalias align 8 %out) {
	Show All 20 Lines

llvm/test/CodeGen/NVPTX/vote.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	declare i1 @llvm.nvvm.vote.all(i1)			declare i1 @llvm.nvvm.vote.all(i1)
	; CHECK-LABEL: .func{{.*}}vote_all			; CHECK-LABEL: .func{{.*}}vote_all
	define i1 @vote_all(i1 %pred) {			define i1 @vote_all(i1 %pred) {
	; CHECK: vote.all.pred			; CHECK: vote.all.pred
	%val = call i1 @llvm.nvvm.vote.all(i1 %pred)			%val = call i1 @llvm.nvvm.vote.all(i1 %pred)
	ret i1 %val			ret i1 %val
	}			}
	▲ Show 20 Lines • Show All 56 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/weak-global.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK: .weak .global .align 4 .u32 g			; CHECK: .weak .global .align 4 .u32 g
	@g = common addrspace(1) global i32 zeroinitializer			@g = common addrspace(1) global i32 zeroinitializer

	define i32 @func0() {			define i32 @func0() {
	%val = load i32, i32 addrspace(1)* @g			%val = load i32, i32 addrspace(1)* @g
	ret i32 %val			ret i32 %val
	}			}

llvm/test/CodeGen/NVPTX/weak-linkage.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s			; RUN: llc < %s -march=nvptx -mcpu=sm_20 \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 \| %ptxas-verify %}

	; CHECK: // .weak foo			; CHECK: // .weak foo
	; CHECK: .weak .func foo			; CHECK: .weak .func foo
	define weak void @foo() {			define weak void @foo() {
	ret void			ret void
	}			}

	; CHECK: // .weak baz			; CHECK: // .weak baz
	Show All 9 Lines

llvm/test/CodeGen/NVPTX/wmma.py

	# This test generates all variants of wmma intrinsics and verifies that LLVM			# This test generates all variants of wmma intrinsics and verifies that LLVM
	# generates correct instructions for them.			# generates correct instructions for them.

	# Check all variants of instructions supported by PTX60 on SM70			# Check all variants of instructions supported by PTX60 on SM70
	# RUN: %python %s --ptx=60 --gpu-arch=70 > %t-ptx60-sm_70.ll			# RUN: %python %s --ptx=60 --gpu-arch=70 > %t-ptx60-sm_70.ll
	# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \			# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
	# RUN: --check-prefixes=INTRINSICS,M16N16			# RUN: --check-prefixes=INTRINSICS,M16N16
	# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \			# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
	# RUN: --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX			# RUN: --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
	# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \			# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
	# RUN: \| FileCheck %t-ptx60-sm_70.ll			# RUN: \| FileCheck %t-ptx60-sm_70.ll
				# RUN: %if ptxas %{ \
				# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
				# RUN: \| %ptxas-verify -arch=sm_70 \
				# RUN: %}

	# Check all variants of instructions supported by PTX61 on SM70			# Check all variants of instructions supported by PTX61 on SM70
	# RUN: %python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll			# RUN: %python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll
	# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \			# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
	# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM			# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM
	# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \			# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
	# RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX			# RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
	# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \			# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
	# RUN: \| FileCheck %t-ptx61-sm_70.ll			# RUN: \| FileCheck %t-ptx61-sm_70.ll
				# RUN: %if ptxas-9.1 %{ \
				# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
				# RUN: \| %ptxas-verify -arch=sm_70 \
				# RUN: %}

	# Check all variants of instructions supported by PTX63 on SM72			# Check all variants of instructions supported by PTX63 on SM72
	# RUN: %python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll			# RUN: %python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll
	# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \			# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
	# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT			# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT
	# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \			# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
	# RUN: --check-prefixes=INTRINSICS,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX			# RUN: --check-prefixes=INTRINSICS,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
	# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \			# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
	# RUN: \| FileCheck %t-ptx63-sm_72.ll			# RUN: \| FileCheck %t-ptx63-sm_72.ll
				# RUN: %if ptxas-10.0 %{ \
				# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
				# RUN: \| %ptxas-verify -arch=sm_72 \
				# RUN: %}

	# Check all variants of instructions supported by PTX63 on SM75			# Check all variants of instructions supported by PTX63 on SM75
	# RUN: %python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll			# RUN: %python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll
	# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \			# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
	# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT			# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT
	# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \			# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
	# RUN: --check-prefixes=INTRINSICS,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX			# RUN: --check-prefixes=INTRINSICS,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
	# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \			# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
	# RUN: \| FileCheck %t-ptx63-sm_75.ll			# RUN: \| FileCheck %t-ptx63-sm_75.ll
				# RUN: %if ptxas-10.0 %{ \
				# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
				# RUN: \| %ptxas-verify -arch=sm_75 \
				# RUN: %}

	# Check all variants of instructions supported by PTX64 on SM70+			# Check all variants of instructions supported by PTX64 on SM70+
	# RUN: %python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll			# RUN: %python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll
	# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \			# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
	# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,MMA			# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,MMA
	# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \			# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
	# RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NODOUBLE,NOALTFLOAT,NOLDMATRIX			# RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NODOUBLE,NOALTFLOAT,NOLDMATRIX
	# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \			# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
	# RUN: \| FileCheck %t-ptx64-sm_70.ll			# RUN: \| FileCheck %t-ptx64-sm_70.ll
				# RUN: %if ptxas-10.1 %{ \
				# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
				# RUN: \| %ptxas-verify -arch=sm_70 \
				# RUN: %}

	# Check all variants of instructions supported by PTX65 on SM75+			# Check all variants of instructions supported by PTX65 on SM75+
	# RUN: %python %s --ptx=65 --gpu-arch=75 > %t-ptx65-sm_75.ll			# RUN: %python %s --ptx=65 --gpu-arch=75 > %t-ptx65-sm_75.ll
	# RUN: FileCheck %t-ptx65-sm_75.ll < %t-ptx65-sm_75.ll \			# RUN: FileCheck %t-ptx65-sm_75.ll < %t-ptx65-sm_75.ll \
	# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT,MMA,PTX65MMA,PTX65LDMATRIX			# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT,MMA,PTX65MMA,PTX65LDMATRIX
	# RUN: FileCheck %t-ptx65-sm_75.ll < %t-ptx65-sm_75.ll \			# RUN: FileCheck %t-ptx65-sm_75.ll < %t-ptx65-sm_75.ll \
	# RUN: --check-prefixes=INTRINSICS			# RUN: --check-prefixes=INTRINSICS
	# RUN: llc < %t-ptx65-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \			# RUN: llc < %t-ptx65-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \
	# RUN: \| FileCheck %t-ptx65-sm_75.ll			# RUN: \| FileCheck %t-ptx65-sm_75.ll
				# RUN: %if ptxas-10.2 %{ \
				# RUN: llc < %t-ptx65-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \
				# RUN: \| %ptxas-verify -arch=sm_75 \
				# RUN: %}

	# Check all variants of instructions supported by PTX71 on SM80+			# Check all variants of instructions supported by PTX71 on SM80+
	# RUN: %python %s --ptx=71 --gpu-arch=80 > %t-ptx71-sm_80.ll			# RUN: %python %s --ptx=71 --gpu-arch=80 > %t-ptx71-sm_80.ll
	# RUN: FileCheck %t-ptx71-sm_80.ll < %t-ptx71-sm_80.ll \			# RUN: FileCheck %t-ptx71-sm_80.ll < %t-ptx71-sm_80.ll \
	# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT,MMA,ALTFLOAT,DOUBLE,PTX65MMA,PTX65LDMATRIX,PTX71MMA			# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT,MMA,ALTFLOAT,DOUBLE,PTX65MMA,PTX65LDMATRIX,PTX71MMA
	# RUN: FileCheck %t-ptx71-sm_80.ll < %t-ptx71-sm_80.ll \			# RUN: FileCheck %t-ptx71-sm_80.ll < %t-ptx71-sm_80.ll \
	# RUN: --check-prefixes=INTRINSICS			# RUN: --check-prefixes=INTRINSICS
	# RUN: llc < %t-ptx71-sm_80.ll -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \			# RUN: llc < %t-ptx71-sm_80.ll -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \
	# RUN: \| FileCheck %t-ptx71-sm_80.ll			# RUN: \| FileCheck %t-ptx71-sm_80.ll
				# RUN: %if ptxas-11.1 %{ \
				# RUN: llc < %t-ptx71-sm_80.ll -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \
				# RUN: \| %ptxas-verify -arch=sm_80 \
				# RUN: %}

	from __future__ import print_function			from __future__ import print_function

	import argparse			import argparse
	from itertools import product			from itertools import product
	from string import Template			from string import Template

	class MMAType:			class MMAType:
	▲ Show 20 Lines • Show All 903 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/zeroext-32bit.ll

	; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs \| %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}

	; The zeroext attribute below should be silently ignored because			; The zeroext attribute below should be silently ignored because
	; we can pass a 32-bit integer across a function call without			; we can pass a 32-bit integer across a function call without
	; needing to extend it.			; needing to extend it.

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
	target triple = "nvptx64-unknown-cuda"			target triple = "nvptx64-unknown-cuda"

	Show All 17 Lines

llvm/test/DebugInfo/NVPTX/crash-missing-DISubprogram.ll

	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda %if ptxas %{ \| %ptxas-verify %}

	; Don't crash for a function w/o debug info that contains an instruction w/			; Don't crash for a function w/o debug info that contains an instruction w/
	; debug info.			; debug info.
	; Reported as #51079			; Reported as #51079

	define weak void @test() {			define weak void @test() {
	ret void, !dbg !10			ret void, !dbg !10
	}			}

	Show All 18 Lines

llvm/test/DebugInfo/NVPTX/cu-range-hole.ll

	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda \| %ptxas-verify %}

	; CHECK: .target sm_{{[0-9]+}}, debug			; CHECK: .target sm_{{[0-9]+}}, debug

	; CHECK: .visible .func (.param .b32 func_retval0) b(			; CHECK: .visible .func (.param .b32 func_retval0) b(
	; CHECK: .param .b32 b_param_0			; CHECK: .param .b32 b_param_0
	; CHECK: )			; CHECK: )
	; CHECK: {			; CHECK: {
	; CHECK: .loc 1 1 0			; CHECK: .loc 1 1 0
	▲ Show 20 Lines • Show All 288 Lines • Show Last 20 Lines

llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll

	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda \| %ptxas-verify %}

	; CHECK: .target sm_20, debug			; CHECK: .target sm_20, debug

	; CHECK: .visible .func use_dbg_declare()			; CHECK: .visible .func use_dbg_declare()
	; CHECK: .local .align 8 .b8 __local_depot0[8];			; CHECK: .local .align 8 .b8 __local_depot0[8];
	; CHECK: mov.u64 %SPL, __local_depot0;			; CHECK: mov.u64 %SPL, __local_depot0;
	; CHECK: add.u64 %rd1, %SP, 0;			; CHECK: add.u64 %rd1, %SP, 0;
	; CHECK: .loc 1 5 3 // t.c:5:3			; CHECK: .loc 1 5 3 // t.c:5:3
	▲ Show 20 Lines • Show All 250 Lines • Show Last 20 Lines

llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll

	; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s \| FileCheck %s			; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda < %s \| %ptxas-verify %}

	; Generated with -O1 from:			; Generated with -O1 from:
	; int f1();			; int f1();
	; void f2(int*);			; void f2(int*);
	; int f3(int);			; int f3(int);
	;			;
	; int foo() {			; int foo() {
	; int i = 3;			; int i = 3;
	; f3(i);			; f3(i);
	▲ Show 20 Lines • Show All 72 Lines • Show Last 20 Lines

llvm/test/DebugInfo/NVPTX/debug-addr-class.ll

	; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s \| FileCheck %s			; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda < %s \| %ptxas-verify %}

	@GLOBAL = addrspace(1) externally_initialized global i32 0, align 4, !dbg !0			@GLOBAL = addrspace(1) externally_initialized global i32 0, align 4, !dbg !0
	@SHARED = addrspace(3) externally_initialized global i32 undef, align 4, !dbg !6			@SHARED = addrspace(3) externally_initialized global i32 undef, align 4, !dbg !6

	define void @test(float, float, float, i32) !dbg !17 {			define void @test(float, float, float, i32) !dbg !17 {
	%5 = alloca float, align 4			%5 = alloca float, align 4
	%6 = alloca float*, align 8			%6 = alloca float*, align 8
	%7 = alloca float*, align 8			%7 = alloca float*, align 8
	▲ Show 20 Lines • Show All 348 Lines • Show Last 20 Lines

llvm/test/DebugInfo/NVPTX/debug-empty.ll

	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda \| %ptxas-verify %}

	; CHECK: .target sm_{{[0-9]+$}}			; CHECK: .target sm_{{[0-9]+$}}
	; CHECK: .section .debug_loc { }			; CHECK: .section .debug_loc { }
	; CHECK-NOT: }			; CHECK-NOT: }

	!llvm.dbg.cu = !{!0}			!llvm.dbg.cu = !{!0}
	!llvm.module.flags = !{!3, !4, !5, !6, !7}			!llvm.module.flags = !{!3, !4, !5, !6, !7}
	!llvm.ident = !{!8}			!llvm.ident = !{!8}
	Show All 10 Lines

llvm/test/DebugInfo/NVPTX/debug-file-loc-only.ll

	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda \| %ptxas-verify %}

	; // Bitcode in this test case is reduced version of compiled code below:			; // Bitcode in this test case is reduced version of compiled code below:
	;extern "C" {			;extern "C" {
	;#line 1 "/source/dir/foo.h"			;#line 1 "/source/dir/foo.h"
	;__device__ void foo() {}			;__device__ void foo() {}
	;#line 2 "/source/dir/bar.cu"			;#line 2 "/source/dir/bar.cu"
	;__device__ void bar() {}			;__device__ void bar() {}
	;}			;}
	Show All 39 Lines

llvm/test/DebugInfo/NVPTX/debug-file-loc.ll

	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda \| %ptxas-verify %}

	; // Bitcode int this test case is reduced version of compiled code below:			; // Bitcode int this test case is reduced version of compiled code below:
	;extern "C" {			;extern "C" {
	;#line 1 "/source/dir/foo.h"			;#line 1 "/source/dir/foo.h"
	;__device__ void foo() {}			;__device__ void foo() {}
	;#line 2 "/source/dir/bar.cu"			;#line 2 "/source/dir/bar.cu"
	;__device__ void bar() {}			;__device__ void bar() {}
	;}			;}
	▲ Show 20 Lines • Show All 96 Lines • Show Last 20 Lines

llvm/test/DebugInfo/NVPTX/debug-info.ll

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda \| %ptxas-verify %}

	; // Bitcode int this test case is reduced version of compiled code below:			; // Bitcode int this test case is reduced version of compiled code below:
	;__device__ inline void res(float x, float y, float res) { res = x + y; }			;__device__ inline void res(float x, float y, float res) { res = x + y; }
	;			;
	;__global__ void saxpy(int n, float a, float x, float y) {			;__global__ void saxpy(int n, float a, float x, float y) {
	; int i = blockIdx.x * blockDim.x + threadIdx.x;			; int i = blockIdx.x * blockDim.x + threadIdx.x;
	; if (i < n)			; if (i < n)
	; res(a * x[i], y[i], &y[i]);			; res(a * x[i], y[i], &y[i]);
	▲ Show 20 Lines • Show All 9,138 Lines • Show Last 20 Lines

llvm/test/DebugInfo/NVPTX/debug-loc-offset.ll

	; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s \| FileCheck %s			; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s \| FileCheck %s
				; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda < %s \| %ptxas-verify %}

	; CHECK: .target sm_{{[0-9]+}}, debug			; CHECK: .target sm_{{[0-9]+}}, debug

	; CHECK: .extern .func (.param .b32 func_retval0) _ZN1A3fooEv			; CHECK: .extern .func (.param .b32 func_retval0) _ZN1A3fooEv
	; CHECK: (			; CHECK: (
	; CHECK: .param .b64 _ZN1A3fooEv_param_0			; CHECK: .param .b64 _ZN1A3fooEv_param_0
	; CHECK: )			; CHECK: )

	▲ Show 20 Lines • Show All 473 Lines • Show Last 20 Lines

llvm/test/DebugInfo/NVPTX/debug-name-table.ll

	; RUN: llc -mtriple=nvptx64-nvidia-cuda -mattr=+ptx75 < %s \| FileCheck %s			; RUN: llc -mtriple=nvptx64-nvidia-cuda -mattr=+ptx75 < %s \| FileCheck %s
				; RUN: %if ptxas-11.5 %{ llc -mtriple=nvptx64-nvidia-cuda -mattr=+ptx75 < %s \| %ptxas-verify %}

	; DICompileUnit without 'nameTableKind: None' results in			; DICompileUnit without 'nameTableKind: None' results in
	; debug_pubnames and debug_pubtypes sections in DWARF. These sections			; debug_pubnames and debug_pubtypes sections in DWARF. These sections
	; use labels and label expressions, and ptxas requires PTX v7.5 to			; use labels and label expressions, and ptxas requires PTX v7.5 to
	; support them.			; support them.

	; CHECK-LABEL: .section .debug_pubnames			; CHECK-LABEL: .section .debug_pubnames
	; CHECK-NEXT: {			; CHECK-NEXT: {
	Show All 35 Lines

llvm/test/DebugInfo/NVPTX/dwarf-file-dir.ll

	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck --check-prefix=CHECK-NODIR %s			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck --check-prefix=CHECK-NODIR %s
	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -dwarf-directory=1 \| FileCheck --check-prefix=CHECK-DIR %s			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -dwarf-directory=1 \| FileCheck --check-prefix=CHECK-DIR %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda \| %ptxas-verify %}

	; CHECK-NODIR: .file {{[0-9]+}} "/tmp/dbginfo/a{{/\|\\\\}}a.cpp"			; CHECK-NODIR: .file {{[0-9]+}} "/tmp/dbginfo/a{{/\|\\\\}}a.cpp"
	;			;
	; ptxas does not support .file directory syntax, but it can still be			; ptxas does not support .file directory syntax, but it can still be
	; forced by -dwarf-directory=1			; forced by -dwarf-directory=1
	; CHECK-DIR: .file {{[0-9]+}} "/tmp/dbginfo/a" "a.cpp"			; CHECK-DIR: .file {{[0-9]+}} "/tmp/dbginfo/a" "a.cpp"

	define void @_Z4funcv() !dbg !4 {			define void @_Z4funcv() !dbg !4 {
	entry:			entry:
	ret void, !dbg !5			ret void, !dbg !5
	}			}

	!llvm.dbg.cu = !{!0}			!llvm.dbg.cu = !{!0}
	!llvm.module.flags = !{!8, !9}			!llvm.module.flags = !{!8, !9}

	!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)			!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2, nameTableKind: None)
	!1 = !DIFile(filename: "a.cpp", directory: "/tmp/dbginfo/a")			!1 = !DIFile(filename: "a.cpp", directory: "/tmp/dbginfo/a")
	!2 = !{}			!2 = !{}
	!4 = distinct !DISubprogram(name: "func", linkageName: "_Z4funcv", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !1, type: !6, retainedNodes: !2)			!4 = distinct !DISubprogram(name: "func", linkageName: "_Z4funcv", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !1, type: !6, retainedNodes: !2)
	!5 = !DILocation(line: 2, scope: !4)			!5 = !DILocation(line: 2, scope: !4)
	!6 = !DISubroutineType(types: !7)			!6 = !DISubroutineType(types: !7)
	!7 = !{null}			!7 = !{null}
	!8 = !{i32 2, !"Dwarf Version", i32 4}			!8 = !{i32 2, !"Dwarf Version", i32 4}
	!9 = !{i32 1, !"Debug Info Version", i32 3}			!9 = !{i32 1, !"Debug Info Version", i32 3}

llvm/test/DebugInfo/NVPTX/packed_bitfields.ll

	; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s			; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda \| FileCheck %s
				; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda \| %ptxas-verify %}

	; Produced at -O0 from:			; Produced at -O0 from:
	; struct {			; struct {
	; char : 3;			; char : 3;
	; char a : 6;			; char a : 6;
	; } __attribute__((__packed__)) b;			; } __attribute__((__packed__)) b;

	; Note that DWARF 2 counts bit offsets backwards from the high end of			; Note that DWARF 2 counts bit offsets backwards from the high end of
	Show All 34 Lines

llvm/test/lit.cfg.py

Show First 20 Lines • Show All 186 Lines • ▼ Show 20 Lines	tools.extend([
ToolSubst('OrcV2CBindingsBasicUsage', unresolved='ignore'),		ToolSubst('OrcV2CBindingsBasicUsage', unresolved='ignore'),
ToolSubst('OrcV2CBindingsAddObjectFile', unresolved='ignore'),		ToolSubst('OrcV2CBindingsAddObjectFile', unresolved='ignore'),
ToolSubst('OrcV2CBindingsRemovableCode', unresolved='ignore'),		ToolSubst('OrcV2CBindingsRemovableCode', unresolved='ignore'),
ToolSubst('OrcV2CBindingsReflectProcessSymbols', unresolved='ignore'),		ToolSubst('OrcV2CBindingsReflectProcessSymbols', unresolved='ignore'),
ToolSubst('OrcV2CBindingsLazy', unresolved='ignore'),		ToolSubst('OrcV2CBindingsLazy', unresolved='ignore'),
ToolSubst('OrcV2CBindingsVeryLazy', unresolved='ignore'),		ToolSubst('OrcV2CBindingsVeryLazy', unresolved='ignore'),
ToolSubst('dxil-dis', unresolved='ignore')])		ToolSubst('dxil-dis', unresolved='ignore')])

		# Find (major, minor) version of ptxas
		def ptxas_version(ptxas):
		    """Run `<ptxas> --version` and parse the release number it reports.

		    Args:
		        ptxas: path (or command name) of the ptxas executable to run.

		    Returns:
		        A (major, minor) tuple of ints taken from the first
		        "release <major>.<minor>" occurrence in the tool's stdout, or
		        None (after printing a diagnostic) when no such text is found.
		    """
		    ptxas_cmd = subprocess.Popen([ptxas, '--version'], stdout=subprocess.PIPE)
		    # communicate() drains stdout and waits for the process in one step.
		    ptxas_out = ptxas_cmd.communicate()[0].decode('ascii')
		    # Raw string: '\d' and '\.' in a plain literal are invalid escape
		    # sequences and trigger a warning on recent Python versions.
		    match = re.search(r'release (\d+)\.(\d+)', ptxas_out)
		    if match:
		        return (int(match.group(1)), int(match.group(2)))
		    print('couldn\'t determine ptxas version')
		    return None

		def enable_ptxas(ptxas_executable):
		    """Expose ptxas to the test suite as lit features and substitutions.

		    Adds a 'ptxas-<major>.<minor>' feature for every known release that
		    is not newer than the detected one, the generic 'ptxas' feature, and
		    the '%ptxas' / '%ptxas-verify' tool substitutions.

		    Args:
		        ptxas_executable: path of the ptxas binary to use.

		    Returns:
		        None. Returns early, registering nothing, when the detected
		        version is older than the oldest known version.
		    """
		    version = ptxas_version(ptxas_executable)
		    if version:
		        # ptxas is supposed to be backward compatible with previous
		        # versions, so add a feature for every known version prior to
		        # the current one.
		        ptxas_known_versions = [
		            (9, 0), (9, 1), (9, 2),
		            (10, 0), (10, 1), (10, 2),
		            (11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6),
		        ]

		        # ignore ptxas if its version is below the minimum supported
		        # version. Tuples compare lexicographically, so (major, minor)
		        # pairs are ordered correctly; comparing the two components
		        # independently would misorder e.g. (10, 0) against (9, 2).
		        min_version = ptxas_known_versions[0]
		        if version < min_version:
		            print(
		                'Warning: ptxas version {}.{} is not supported'.format(
		                    version[0], version[1]))
		            return

		        for known_version in ptxas_known_versions:
		            # Whole-tuple comparison: ptxas 11.0 must still provide the
		            # 'ptxas-10.2' and 'ptxas-9.1' features even though its
		            # minor number is smaller than theirs.
		            if known_version <= version:
		                config.available_features.add(
		                    'ptxas-{}.{}'.format(known_version[0], known_version[1]))

		    # NOTE(review): the listing this was taken from has lost its
		    # indentation; keeping the statements below at function level (so
		    # they also run when the version could not be determined) is an
		    # assumption -- confirm against the original file.
		    config.available_features.add('ptxas')
		    tools.extend([ToolSubst('%ptxas', ptxas_executable),
		                  ToolSubst('%ptxas-verify', '{} -c -o /dev/null -'.format(
		                      ptxas_executable))])

		# Choose the ptxas binary: a non-empty LLVM_PTXAS_EXECUTABLE environment
		# variable wins over the path recorded in the site configuration.
		ptxas_executable = (
		    os.environ.get('LLVM_PTXAS_EXECUTABLE', None) or config.ptxas_executable)
		if ptxas_executable:
		    enable_ptxas(ptxas_executable)

llvm_config.add_tool_substitutions(tools, config.llvm_tools_dir)		llvm_config.add_tool_substitutions(tools, config.llvm_tools_dir)

# Targets		# Targets

config.targets = frozenset(config.targets_to_build.split())		config.targets = frozenset(config.targets_to_build.split())

for arch in config.targets_to_build.split():		for arch in config.targets_to_build.split():
config.available_features.add(arch.lower() + '-registered-target')		config.available_features.add(arch.lower() + '-registered-target')
▲ Show 20 Lines • Show All 228 Lines • Show Last 20 Lines

llvm/test/lit.site.cfg.py.in

	Show All 17 Lines
	config.gold_executable = "@GOLD_EXECUTABLE@"			config.gold_executable = "@GOLD_EXECUTABLE@"
	config.ld64_executable = "@LD64_EXECUTABLE@"			config.ld64_executable = "@LD64_EXECUTABLE@"
	config.osx_sysroot = path(r"@CMAKE_OSX_SYSROOT@")			config.osx_sysroot = path(r"@CMAKE_OSX_SYSROOT@")
	config.ocamlfind_executable = "@OCAMLFIND@"			config.ocamlfind_executable = "@OCAMLFIND@"
	config.have_ocamlopt = @HAVE_OCAMLOPT@			config.have_ocamlopt = @HAVE_OCAMLOPT@
	config.ocaml_flags = "@OCAMLFLAGS@"			config.ocaml_flags = "@OCAMLFLAGS@"
	config.include_go_tests = @LLVM_INCLUDE_GO_TESTS@			config.include_go_tests = @LLVM_INCLUDE_GO_TESTS@
	config.go_executable = "@GO_EXECUTABLE@"			config.go_executable = "@GO_EXECUTABLE@"
				config.ptxas_executable = "@PXTAS_EXECUTABLE@"
	config.enable_shared = @ENABLE_SHARED@			config.enable_shared = @ENABLE_SHARED@
	config.enable_assertions = @ENABLE_ASSERTIONS@			config.enable_assertions = @ENABLE_ASSERTIONS@
	config.targets_to_build = "@TARGETS_TO_BUILD@"			config.targets_to_build = "@TARGETS_TO_BUILD@"
	config.native_target = "@LLVM_NATIVE_ARCH@"			config.native_target = "@LLVM_NATIVE_ARCH@"
	config.llvm_bindings = "@LLVM_BINDINGS@".split(' ')			config.llvm_bindings = "@LLVM_BINDINGS@".split(' ')
	config.host_os = "@HOST_OS@"			config.host_os = "@HOST_OS@"
	config.host_cc = "@HOST_CC@"			config.host_cc = "@HOST_CC@"
	config.host_cxx = "@HOST_CXX@"			config.host_cxx = "@HOST_CXX@"
	Show All 34 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[NVPTX] Integrate ptxas to LIT tests (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 425748

llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll

llvm/test/CodeGen/NVPTX/MachineSink-call.ll

llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll

llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll

llvm/test/CodeGen/NVPTX/access-non-generic.ll

llvm/test/CodeGen/NVPTX/add-128bit.ll

llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll

llvm/test/CodeGen/NVPTX/addrspacecast.ll

llvm/test/CodeGen/NVPTX/aggr-param.ll

llvm/test/CodeGen/NVPTX/aggregate-return.ll

llvm/test/CodeGen/NVPTX/annotations.ll

llvm/test/CodeGen/NVPTX/arg-lowering.ll

llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll

llvm/test/CodeGen/NVPTX/arithmetic-int.ll

llvm/test/CodeGen/NVPTX/async-copy.ll

llvm/test/CodeGen/NVPTX/atomics-sm60.ll

llvm/test/CodeGen/NVPTX/atomics-with-scope.ll

llvm/test/CodeGen/NVPTX/atomics.ll

llvm/test/CodeGen/NVPTX/b52037.ll

llvm/test/CodeGen/NVPTX/barrier.ll

llvm/test/CodeGen/NVPTX/bfe.ll

llvm/test/CodeGen/NVPTX/branch-fold.ll

llvm/test/CodeGen/NVPTX/bug17709.ll

llvm/test/CodeGen/NVPTX/bug21465.ll

llvm/test/CodeGen/NVPTX/bug22246.ll

llvm/test/CodeGen/NVPTX/bug22322.ll

llvm/test/CodeGen/NVPTX/bug26185-2.ll

llvm/test/CodeGen/NVPTX/bug26185.ll

llvm/test/CodeGen/NVPTX/bug41651.ll

llvm/test/CodeGen/NVPTX/bypass-div.ll

llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll

llvm/test/CodeGen/NVPTX/callchain.ll

llvm/test/CodeGen/NVPTX/calling-conv.ll

llvm/test/CodeGen/NVPTX/calls-with-phi.ll

llvm/test/CodeGen/NVPTX/combine-min-max.ll

llvm/test/CodeGen/NVPTX/compare-int.ll

llvm/test/CodeGen/NVPTX/constant-vectors.ll

llvm/test/CodeGen/NVPTX/convert-fp.ll

llvm/test/CodeGen/NVPTX/convert-int-sm20.ll

llvm/test/CodeGen/NVPTX/convert-sm80.ll

llvm/test/CodeGen/NVPTX/ctlz.ll

llvm/test/CodeGen/NVPTX/ctpop.ll

llvm/test/CodeGen/NVPTX/cttz.ll

llvm/test/CodeGen/NVPTX/disable-opt.ll

llvm/test/CodeGen/NVPTX/div-ri.ll

llvm/test/CodeGen/NVPTX/divrem-combine.ll

llvm/test/CodeGen/NVPTX/envreg.ll

llvm/test/CodeGen/NVPTX/extloadv.ll

llvm/test/CodeGen/NVPTX/f16-ex2.ll

llvm/test/CodeGen/NVPTX/f16-instructions.ll

llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

llvm/test/CodeGen/NVPTX/fast-math.ll

llvm/test/CodeGen/NVPTX/fma-assoc.ll

llvm/test/CodeGen/NVPTX/fma-disable.ll

llvm/test/CodeGen/NVPTX/fma.ll

llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll

llvm/test/CodeGen/NVPTX/fns.ll

llvm/test/CodeGen/NVPTX/fp-contract.ll

llvm/test/CodeGen/NVPTX/fp-literals.ll

llvm/test/CodeGen/NVPTX/fp16.ll

llvm/test/CodeGen/NVPTX/function-align.ll

llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll

llvm/test/CodeGen/NVPTX/global-addrspace.ll

llvm/test/CodeGen/NVPTX/global-ordering.ll

llvm/test/CodeGen/NVPTX/global-variable-big.ll

llvm/test/CodeGen/NVPTX/global-visibility.ll

llvm/test/CodeGen/NVPTX/globals_init.ll

llvm/test/CodeGen/NVPTX/globals_lowering.ll

llvm/test/CodeGen/NVPTX/half.ll

llvm/test/CodeGen/NVPTX/i1-global.ll

llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll

llvm/test/CodeGen/NVPTX/i1-param.ll

llvm/test/CodeGen/NVPTX/i128-global.ll

[NVPTX] Integrate ptxas to LIT tests
ClosedPublic