diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll --- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s | FileCheck -check-prefix=ENABLED %s ; RUN: llc -disable-nvptx-load-store-vectorizer < %s | FileCheck -check-prefix=DISABLED %s +; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} +; RUN: %if ptxas %{ llc -disable-nvptx-load-store-vectorizer < %s | %ptxas-verify %} + target triple = "nvptx64-nvidia-cuda" ; Check that the load-store vectorizer is enabled by default for nvptx, and diff --git a/llvm/test/CodeGen/NVPTX/MachineSink-call.ll b/llvm/test/CodeGen/NVPTX/MachineSink-call.ll --- a/llvm/test/CodeGen/NVPTX/MachineSink-call.ll +++ b/llvm/test/CodeGen/NVPTX/MachineSink-call.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s | FileCheck %s +; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} + target triple = "nvptx64-nvidia-cuda" declare void @foo() diff --git a/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll b/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll --- a/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll +++ b/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s | FileCheck %s +; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} + target triple = "nvptx64-nvidia-cuda" declare void @llvm.nvvm.barrier0() diff --git a/llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll b/llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll --- a/llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll +++ b/llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll @@ -1,4 +1,5 @@ ; RUN: llc -O2 -tail-dup-size=100 -enable-tail-merge=0 < %s | FileCheck %s +; RUN: %if ptxas %{ llc -O2 -tail-dup-size=100 -enable-tail-merge=0 < %s | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" declare void @foo() diff --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll --- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll +++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll @@ -2,6 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX ; RUN: opt -mtriple=nvptx-- < %s -S -infer-address-spaces | FileCheck %s --check-prefix IR ; RUN: opt -mtriple=nvptx64-- < %s -S -infer-address-spaces | FileCheck %s --check-prefix IR +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4 @scalar = internal addrspace(3) global float 0.000000e+00, align 4 diff --git a/llvm/test/CodeGen/NVPTX/add-128bit.ll b/llvm/test/CodeGen/NVPTX/add-128bit.ll --- a/llvm/test/CodeGen/NVPTX/add-128bit.ll +++ b/llvm/test/CodeGen/NVPTX/add-128bit.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll --- a/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK: .visible .global .align 4 .u32 g = 42; ; CHECK: .visible .global .align 4 .u32 g2 = generic(g); diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll --- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll @@ -1,6 +1,9 @@ ; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32,G32 ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64,G64 ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr| FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64,G64 +; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %} ; ALL-LABEL: conv1 define i32 @conv1(i32 addrspace(1)* %ptr) { diff --git a/llvm/test/CodeGen/NVPTX/aggr-param.ll b/llvm/test/CodeGen/NVPTX/aggr-param.ll --- a/llvm/test/CodeGen/NVPTX/aggr-param.ll +++ b/llvm/test/CodeGen/NVPTX/aggr-param.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; Make sure aggregate param types get emitted properly. diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll --- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll +++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} declare <2 x float> @barv(<2 x float> %input) declare <3 x float> @barv3(<3 x float> %input) diff --git a/llvm/test/CodeGen/NVPTX/annotations.ll b/llvm/test/CodeGen/NVPTX/annotations.ll --- a/llvm/test/CodeGen/NVPTX/annotations.ll +++ b/llvm/test/CodeGen/NVPTX/annotations.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} @texture = internal addrspace(1) global i64 0, align 8 ; CHECK: .global .texref texture diff --git a/llvm/test/CodeGen/NVPTX/arg-lowering.ll b/llvm/test/CodeGen/NVPTX/arg-lowering.ll --- a/llvm/test/CodeGen/NVPTX/arg-lowering.ll +++ b/llvm/test/CodeGen/NVPTX/arg-lowering.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK: .visible .func (.param .align 16 .b8 func_retval0[16]) foo0( ; CHECK: .param .align 4 .b8 foo0_param_0[8] diff --git a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll --- a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll +++ b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} ;; These tests should run for all targets diff --git a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll --- a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll +++ b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; These tests should run for all targets diff --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll --- a/llvm/test/CodeGen/NVPTX/async-copy.ll +++ b/llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,5 +1,7 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s +; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll --- a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} ; CHECK-LABEL: .func test( define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) { diff --git a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll --- a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s -check-prefixes=CHECK,CHECK32 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} ; CHECK-LABEL: .func test_atomics_scope( define void @test_atomics_scope(float* %fp, float %f, diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_32 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_32 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_32%} %} ; CHECK-LABEL: atom0 diff --git a/llvm/test/CodeGen/NVPTX/b52037.ll b/llvm/test/CodeGen/NVPTX/b52037.ll --- a/llvm/test/CodeGen/NVPTX/b52037.ll +++ b/llvm/test/CodeGen/NVPTX/b52037.ll @@ -4,6 +4,7 @@ ; https://bugs.llvm.org/show_bug.cgi?id=52037 for the gory details. ; ; RUN: llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s | FileCheck %s +; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s | %ptxas-verify -arch=sm_70 %} ; CHECK-LABEL: .visible .entry barney( ; CHECK-NOT: .local{{.*}}__local_depot diff --git a/llvm/test/CodeGen/NVPTX/barrier.ll b/llvm/test/CodeGen/NVPTX/barrier.ll --- a/llvm/test/CodeGen/NVPTX/barrier.ll +++ b/llvm/test/CodeGen/NVPTX/barrier.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} declare void @llvm.nvvm.bar.warp.sync(i32) declare void @llvm.nvvm.barrier.sync(i32) diff --git a/llvm/test/CodeGen/NVPTX/bfe.ll b/llvm/test/CodeGen/NVPTX/bfe.ll --- a/llvm/test/CodeGen/NVPTX/bfe.ll +++ b/llvm/test/CodeGen/NVPTX/bfe.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK: bfe0 diff --git a/llvm/test/CodeGen/NVPTX/branch-fold.ll b/llvm/test/CodeGen/NVPTX/branch-fold.ll --- a/llvm/test/CodeGen/NVPTX/branch-fold.ll +++ b/llvm/test/CodeGen/NVPTX/branch-fold.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -disable-cgp -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -disable-cgp -verify-machineinstrs | %ptxas-verify -arch=sm_35 %} + ; Disable CGP which also folds branches, so that only BranchFolding is under ; the spotlight. diff --git a/llvm/test/CodeGen/NVPTX/bug17709.ll b/llvm/test/CodeGen/NVPTX/bug17709.ll --- a/llvm/test/CodeGen/NVPTX/bug17709.ll +++ b/llvm/test/CodeGen/NVPTX/bug17709.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; ModuleID = '__kernelgen_main_module' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll --- a/llvm/test/CodeGen/NVPTX/bug21465.ll +++ b/llvm/test/CodeGen/NVPTX/bug21465.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -nvptx-lower-args -S | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-unknown-unknown" diff --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll --- a/llvm/test/CodeGen/NVPTX/bug22246.ll +++ b/llvm/test/CodeGen/NVPTX/bug22246.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/bug22322.ll b/llvm/test/CodeGen/NVPTX/bug22322.ll --- a/llvm/test/CodeGen/NVPTX/bug22322.ll +++ b/llvm/test/CodeGen/NVPTX/bug22322.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll --- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll +++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} ; Verify that we correctly emit code for extending ldg/ldu. We do not expose ; extending variants in the backend, but the ldg/ldu selection code may pick diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll --- a/llvm/test/CodeGen/NVPTX/bug26185.ll +++ b/llvm/test/CodeGen/NVPTX/bug26185.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} ; Verify that we correctly emit code for i8 ldg/ldu. We do not expose 8-bit ; registers in the backend, so these loads need special handling. diff --git a/llvm/test/CodeGen/NVPTX/bug41651.ll b/llvm/test/CodeGen/NVPTX/bug41651.ll --- a/llvm/test/CodeGen/NVPTX/bug41651.ll +++ b/llvm/test/CodeGen/NVPTX/bug41651.ll @@ -1,4 +1,6 @@ ; RUN: llc -filetype=asm -o - %s | FileCheck %s +; RUN: %if ptxas %{ llc -filetype=asm -o - %s | %ptxas-verify %} + target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/bypass-div.ll b/llvm/test/CodeGen/NVPTX/bypass-div.ll --- a/llvm/test/CodeGen/NVPTX/bypass-div.ll +++ b/llvm/test/CodeGen/NVPTX/bypass-div.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} ; 64-bit divides and rems should be split into a fast and slow path where ; the fast path uses a 32-bit operation. diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll --- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll +++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; Checks how NVPTX lowers alloca buffers and their passing to functions. ; diff --git a/llvm/test/CodeGen/NVPTX/callchain.ll b/llvm/test/CodeGen/NVPTX/callchain.ll --- a/llvm/test/CodeGen/NVPTX/callchain.ll +++ b/llvm/test/CodeGen/NVPTX/callchain.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target triple = "nvptx" diff --git a/llvm/test/CodeGen/NVPTX/calling-conv.ll b/llvm/test/CodeGen/NVPTX/calling-conv.ll --- a/llvm/test/CodeGen/NVPTX/calling-conv.ll +++ b/llvm/test/CodeGen/NVPTX/calling-conv.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; Kernel function using ptx_kernel calling conv diff --git a/llvm/test/CodeGen/NVPTX/calls-with-phi.ll b/llvm/test/CodeGen/NVPTX/calls-with-phi.ll --- a/llvm/test/CodeGen/NVPTX/calls-with-phi.ll +++ b/llvm/test/CodeGen/NVPTX/calls-with-phi.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s -march=nvptx 2>&1 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %} + ; Make sure the example doesn't crash with segfault ; CHECK: .visible .func ({{.*}}) loop diff --git a/llvm/test/CodeGen/NVPTX/combine-min-max.ll b/llvm/test/CodeGen/NVPTX/combine-min-max.ll --- a/llvm/test/CodeGen/NVPTX/combine-min-max.ll +++ b/llvm/test/CodeGen/NVPTX/combine-min-max.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O2 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O2 | %ptxas-verify %} ; ************************************* ; * Cases with no min/max diff --git a/llvm/test/CodeGen/NVPTX/compare-int.ll b/llvm/test/CodeGen/NVPTX/compare-int.ll --- a/llvm/test/CodeGen/NVPTX/compare-int.ll +++ b/llvm/test/CodeGen/NVPTX/compare-int.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; These tests should run for all targets diff --git a/llvm/test/CodeGen/NVPTX/constant-vectors.ll b/llvm/test/CodeGen/NVPTX/constant-vectors.ll --- a/llvm/test/CodeGen/NVPTX/constant-vectors.ll +++ b/llvm/test/CodeGen/NVPTX/constant-vectors.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target triple = "nvptx-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/convert-fp.ll b/llvm/test/CodeGen/NVPTX/convert-fp.ll --- a/llvm/test/CodeGen/NVPTX/convert-fp.ll +++ b/llvm/test/CodeGen/NVPTX/convert-fp.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define i16 @cvt_u16_f32(float %x) { ; CHECK: cvt.rzi.u16.f32 %rs{{[0-9]+}}, %f{{[0-9]+}}; diff --git a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll --- a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll +++ b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; Integer conversions happen inplicitly by loading/storing the proper types diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll --- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s +; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} ; CHECK-LABEL: cvt_rn_bf16x2_f32 diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll --- a/llvm/test/CodeGen/NVPTX/ctlz.ll +++ b/llvm/test/CodeGen/NVPTX/ctlz.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/ctpop.ll b/llvm/test/CodeGen/NVPTX/ctpop.ll --- a/llvm/test/CodeGen/NVPTX/ctpop.ll +++ b/llvm/test/CodeGen/NVPTX/ctpop.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/cttz.ll b/llvm/test/CodeGen/NVPTX/cttz.ll --- a/llvm/test/CodeGen/NVPTX/cttz.ll +++ b/llvm/test/CodeGen/NVPTX/cttz.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/disable-opt.ll b/llvm/test/CodeGen/NVPTX/disable-opt.ll --- a/llvm/test/CodeGen/NVPTX/disable-opt.ll +++ b/llvm/test/CodeGen/NVPTX/disable-opt.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O0 | %ptxas-verify %} define void @foo(i32* %output) { ; CHECK-LABEL: .visible .func foo( diff --git a/llvm/test/CodeGen/NVPTX/div-ri.ll b/llvm/test/CodeGen/NVPTX/div-ri.ll --- a/llvm/test/CodeGen/NVPTX/div-ri.ll +++ b/llvm/test/CodeGen/NVPTX/div-ri.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 | %ptxas-verify %} define float @foo(float %a) { ; CHECK: div.approx.f32 diff --git a/llvm/test/CodeGen/NVPTX/divrem-combine.ll b/llvm/test/CodeGen/NVPTX/divrem-combine.ll --- a/llvm/test/CodeGen/NVPTX/divrem-combine.ll +++ b/llvm/test/CodeGen/NVPTX/divrem-combine.ll @@ -1,5 +1,7 @@ ; RUN: llc -O2 < %s -march=nvptx -mcpu=sm_35 | FileCheck %s --check-prefix=O2 --check-prefix=CHECK ; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_35 | FileCheck %s --check-prefix=O0 --check-prefix=CHECK +; RUN: %if ptxas %{ llc -O2 < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} +; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} ; The following IR ; diff --git a/llvm/test/CodeGen/NVPTX/envreg.ll b/llvm/test/CodeGen/NVPTX/envreg.ll --- a/llvm/test/CodeGen/NVPTX/envreg.ll +++ b/llvm/test/CodeGen/NVPTX/envreg.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} declare i32 @llvm.nvvm.read.ptx.sreg.envreg0() diff --git a/llvm/test/CodeGen/NVPTX/extloadv.ll b/llvm/test/CodeGen/NVPTX/extloadv.ll --- a/llvm/test/CodeGen/NVPTX/extloadv.ll +++ b/llvm/test/CodeGen/NVPTX/extloadv.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} define void @foo(float* nocapture readonly %x_value, double* nocapture %output) #0 { %1 = bitcast float* %x_value to <4 x float>* diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll --- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s +; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} declare half @llvm.nvvm.ex2.approx.f16(half) declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>) diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll --- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll @@ -2,20 +2,41 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-F16-NOFTZ %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_53 \ +; RUN: %} ; ## Full FP16 with FTZ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: -denormal-fp-math-f32=preserve-sign \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16-FTZ %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: -denormal-fp-math-f32=preserve-sign \ +; RUN: | %ptxas-verify -arch=sm_53 \ +; RUN: %} ; ## FP16 support explicitly disabled. ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \ ; RUN: -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \ +; RUN: | %ptxas-verify -arch=sm_53 \ +; RUN: %} ; ## FP16 is not supported by hardware. ; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ ; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ +; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_52 \ +; RUN: %} target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -2,15 +2,31 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-F16 %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_53 \ +; RUN: %} ; ## FP16 support explicitly disabled. ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \ ; RUN: -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \ +; RUN: -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_53 \ +; RUN: %} ; ## FP16 is not supported by hardware. ; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ ; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ +; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_52 \ +; RUN: %} target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll --- a/llvm/test/CodeGen/NVPTX/fast-math.ll +++ b/llvm/test/CodeGen/NVPTX/fast-math.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} declare float @llvm.sqrt.f32(float) declare double @llvm.sqrt.f64(double) diff --git a/llvm/test/CodeGen/NVPTX/fma-assoc.ll b/llvm/test/CodeGen/NVPTX/fma-assoc.ll --- a/llvm/test/CodeGen/NVPTX/fma-assoc.ll +++ b/llvm/test/CodeGen/NVPTX/fma-assoc.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | %ptxas-verify %} define ptx_device float @t1_f32(float %x, float %y, float %z, float %u, float %v) { diff --git a/llvm/test/CodeGen/NVPTX/fma-disable.ll b/llvm/test/CodeGen/NVPTX/fma-disable.ll --- a/llvm/test/CodeGen/NVPTX/fma-disable.ll +++ b/llvm/test/CodeGen/NVPTX/fma-disable.ll @@ -2,6 +2,10 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | FileCheck %s -check-prefix=FMA ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %} define ptx_device float @test_mul_add_f(float %x, float %y, float %z) { entry: diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll --- a/llvm/test/CodeGen/NVPTX/fma.ll +++ b/llvm/test/CodeGen/NVPTX/fma.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | %ptxas-verify %} declare float @dummy_f32(float, float) #0 declare double @dummy_f64(double, double) #0 diff --git a/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll b/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll --- a/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx | FileCheck %s --check-prefixes=CHECK,CHECK-NONAN ; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK,CHECK-NAN +; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %} +; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} ; ---- minimum ---- diff --git a/llvm/test/CodeGen/NVPTX/fns.ll b/llvm/test/CodeGen/NVPTX/fns.ll --- a/llvm/test/CodeGen/NVPTX/fns.ll +++ b/llvm/test/CodeGen/NVPTX/fns.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} declare i32 @llvm.nvvm.fns(i32, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/fp-contract.ll b/llvm/test/CodeGen/NVPTX/fp-contract.ll --- a/llvm/test/CodeGen/NVPTX/fp-contract.ll +++ b/llvm/test/CodeGen/NVPTX/fp-contract.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} target triple = "nvptx64-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/fp-literals.ll b/llvm/test/CodeGen/NVPTX/fp-literals.ll --- a/llvm/test/CodeGen/NVPTX/fp-literals.ll +++ b/llvm/test/CodeGen/NVPTX/fp-literals.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/fp16.ll b/llvm/test/CodeGen/NVPTX/fp16.ll --- a/llvm/test/CodeGen/NVPTX/fp16.ll +++ b/llvm/test/CodeGen/NVPTX/fp16.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s +; RUN: %if ptxas %{ llc -march=nvptx -verify-machineinstrs < %s | %ptxas-verify %} declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone diff --git a/llvm/test/CodeGen/NVPTX/function-align.ll b/llvm/test/CodeGen/NVPTX/function-align.ll --- a/llvm/test/CodeGen/NVPTX/function-align.ll +++ b/llvm/test/CodeGen/NVPTX/function-align.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK-NOT: .align 2 define ptx_device void @foo() align 2 { diff --git a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll --- a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll +++ b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/global-addrspace.ll b/llvm/test/CodeGen/NVPTX/global-addrspace.ll --- a/llvm/test/CodeGen/NVPTX/global-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/global-addrspace.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; PTX32: .visible .global .align 4 .u32 i; ; PTX32: .visible .const .align 4 .u32 j; diff --git a/llvm/test/CodeGen/NVPTX/global-ordering.ll b/llvm/test/CodeGen/NVPTX/global-ordering.ll --- a/llvm/test/CodeGen/NVPTX/global-ordering.ll +++ b/llvm/test/CodeGen/NVPTX/global-ordering.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; Make sure we emit these globals in def-use order diff --git a/llvm/test/CodeGen/NVPTX/global-variable-big.ll b/llvm/test/CodeGen/NVPTX/global-variable-big.ll --- a/llvm/test/CodeGen/NVPTX/global-variable-big.ll +++ b/llvm/test/CodeGen/NVPTX/global-variable-big.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s | FileCheck %s +; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} + target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/global-visibility.ll b/llvm/test/CodeGen/NVPTX/global-visibility.ll --- a/llvm/test/CodeGen/NVPTX/global-visibility.ll +++ b/llvm/test/CodeGen/NVPTX/global-visibility.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; PTX does not support .hidden or .protected. ; Make sure we do not emit them. diff --git a/llvm/test/CodeGen/NVPTX/globals_init.ll b/llvm/test/CodeGen/NVPTX/globals_init.ll --- a/llvm/test/CodeGen/NVPTX/globals_init.ll +++ b/llvm/test/CodeGen/NVPTX/globals_init.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; Make sure the globals constant initializers are not prone to host endianess ; issues. diff --git a/llvm/test/CodeGen/NVPTX/globals_lowering.ll b/llvm/test/CodeGen/NVPTX/globals_lowering.ll --- a/llvm/test/CodeGen/NVPTX/globals_lowering.ll +++ b/llvm/test/CodeGen/NVPTX/globals_lowering.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -relocation-model=static | FileCheck %s --check-prefix CHK +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -relocation-model=static | %ptxas-verify %} %MyStruct = type { i32, i32, float } @Gbl = internal addrspace(3) global [1024 x %MyStruct] zeroinitializer diff --git a/llvm/test/CodeGen/NVPTX/half.ll b/llvm/test/CodeGen/NVPTX/half.ll --- a/llvm/test/CodeGen/NVPTX/half.ll +++ b/llvm/test/CodeGen/NVPTX/half.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %} ; CHECK: .b8 half_array[8] = {1, 2, 3, 4, 5, 6, 7, 8}; @"half_array" = addrspace(1) constant [4 x half] diff --git a/llvm/test/CodeGen/NVPTX/i1-global.ll b/llvm/test/CodeGen/NVPTX/i1-global.ll --- a/llvm/test/CodeGen/NVPTX/i1-global.ll +++ b/llvm/test/CodeGen/NVPTX/i1-global.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll b/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll --- a/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll +++ b/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: foo ; CHECK: setp diff --git a/llvm/test/CodeGen/NVPTX/i1-param.ll b/llvm/test/CodeGen/NVPTX/i1-param.ll --- a/llvm/test/CodeGen/NVPTX/i1-param.ll +++ b/llvm/test/CodeGen/NVPTX/i1-param.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/i128-global.ll b/llvm/test/CodeGen/NVPTX/i128-global.ll --- a/llvm/test/CodeGen/NVPTX/i128-global.ll +++ b/llvm/test/CodeGen/NVPTX/i128-global.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; CHECK: .visible .global .align 16 .b8 G1[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; @G1 = global i128 1 diff --git a/llvm/test/CodeGen/NVPTX/i128-param.ll b/llvm/test/CodeGen/NVPTX/i128-param.ll --- a/llvm/test/CodeGen/NVPTX/i128-param.ll +++ b/llvm/test/CodeGen/NVPTX/i128-param.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: .visible .func callee( ; CHECK-NEXT: .param .align 16 .b8 callee_param_0[16], diff --git a/llvm/test/CodeGen/NVPTX/i128-retval.ll b/llvm/test/CodeGen/NVPTX/i128-retval.ll --- a/llvm/test/CodeGen/NVPTX/i128-retval.ll +++ b/llvm/test/CodeGen/NVPTX/i128-retval.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[16]) callee( define i128 @callee(i128) { diff --git a/llvm/test/CodeGen/NVPTX/i128-struct.ll b/llvm/test/CodeGen/NVPTX/i128-struct.ll --- a/llvm/test/CodeGen/NVPTX/i128-struct.ll +++ b/llvm/test/CodeGen/NVPTX/i128-struct.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[32]) foo( define { i128, i128 } @foo(i64 %a, i32 %b) { diff --git a/llvm/test/CodeGen/NVPTX/i8-param.ll b/llvm/test/CodeGen/NVPTX/i8-param.ll --- a/llvm/test/CodeGen/NVPTX/i8-param.ll +++ b/llvm/test/CodeGen/NVPTX/i8-param.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll --- a/llvm/test/CodeGen/NVPTX/idioms.ll +++ b/llvm/test/CodeGen/NVPTX/idioms.ll @@ -2,6 +2,8 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: abs_i16( define i16 @abs_i16(i16 %a) { diff --git a/llvm/test/CodeGen/NVPTX/imad.ll b/llvm/test/CodeGen/NVPTX/imad.ll --- a/llvm/test/CodeGen/NVPTX/imad.ll +++ b/llvm/test/CodeGen/NVPTX/imad.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK: imad define i32 @imad(i32 %a, i32 %b, i32 %c) { diff --git a/llvm/test/CodeGen/NVPTX/inline-asm.ll b/llvm/test/CodeGen/NVPTX/inline-asm.ll --- a/llvm/test/CodeGen/NVPTX/inline-asm.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} define float @test(float %x) { entry: diff --git a/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll b/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll --- a/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll +++ b/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=nvptx < %s | FileCheck %s +; RUN: %if ptxas %{ llc -march=nvptx < %s | %ptxas-verify %} ; Test that %c works with immediates ; CHECK-LABEL: test_inlineasm_c_output_template0 diff --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll --- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll @@ -10,6 +10,8 @@ ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \ ; RUN: -passes=nvvm-intr-range -nvvm-intr-range-sm=30 \ ; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_30 %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define ptx_device i32 @test_tid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x; diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll --- a/llvm/test/CodeGen/NVPTX/intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: test_fabsf( define float @test_fabsf(float %f) { diff --git a/llvm/test/CodeGen/NVPTX/isspacep.ll b/llvm/test/CodeGen/NVPTX/isspacep.ll --- a/llvm/test/CodeGen/NVPTX/isspacep.ll +++ b/llvm/test/CodeGen/NVPTX/isspacep.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} declare i1 @llvm.nvvm.isspacep.const(i8*) readnone noinline declare i1 @llvm.nvvm.isspacep.global(i8*) readnone noinline diff --git a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll --- a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll @@ -1,6 +1,9 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %} ;; i8 diff --git a/llvm/test/CodeGen/NVPTX/ld-generic.ll b/llvm/test/CodeGen/NVPTX/ld-generic.ll --- a/llvm/test/CodeGen/NVPTX/ld-generic.ll +++ b/llvm/test/CodeGen/NVPTX/ld-generic.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; i8 diff --git a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py --- a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py +++ b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py @@ -4,6 +4,8 @@ # RUN: %python %s > %t.ll # RUN: llc < %t.ll -march=nvptx64 -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P64 %t.ll # RUN: llc < %t.ll -march=nvptx -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P32 %t.ll +# RUN: %if ptxas %{ llc < %t.ll -march=nvptx64 -mcpu=sm_30 | %ptxas-verify %} +# RUN: %if ptxas %{ llc < %t.ll -march=nvptx -mcpu=sm_30 | %ptxas-verify %} from __future__ import print_function diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll --- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} ; Check that invariant loads from the global addrspace are lowered to ; ld.global.nc. diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll --- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} declare <4 x float> @bar() diff --git a/llvm/test/CodeGen/NVPTX/ldu-i8.ll b/llvm/test/CodeGen/NVPTX/ldu-i8.ll --- a/llvm/test/CodeGen/NVPTX/ldu-i8.ll +++ b/llvm/test/CodeGen/NVPTX/ldu-i8.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll --- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll +++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align) diff --git a/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll --- a/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll +++ b/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll b/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll --- a/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll +++ b/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s -march=nvptx 2>&1 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %} + ; Allow to make libcalls that are defined in the current module declare i8* @malloc(i64) diff --git a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll --- a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll +++ b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: plain define void @plain(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr { diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll --- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll +++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefix=SM20 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck -check-prefix=SM35 %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-unknown-unknown" diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll --- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll +++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX64 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; Ensure we access the local stack properly diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll --- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll +++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | %ptxas-verify -arch=sm_35 %} ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to ; llvm.mem* intrinsics get lowered to loops. diff --git a/llvm/test/CodeGen/NVPTX/lower-alloca.ll b/llvm/test/CodeGen/NVPTX/lower-alloca.ll --- a/llvm/test/CodeGen/NVPTX/lower-alloca.ll +++ b/llvm/test/CodeGen/NVPTX/lower-alloca.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -S -nvptx-lower-alloca -infer-address-spaces | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-unknown-unknown" diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -S -nvptx-lower-args | FileCheck %s --check-prefix IR ; RUN: llc < %s -mcpu=sm_20 | FileCheck %s --check-prefix PTX +; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64 ; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32 +; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} %struct.ham = type { [4 x i32] } diff --git a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll --- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll +++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/machine-sink.ll b/llvm/test/CodeGen/NVPTX/machine-sink.ll --- a/llvm/test/CodeGen/NVPTX/machine-sink.ll +++ b/llvm/test/CodeGen/NVPTX/machine-sink.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/managed.ll b/llvm/test/CodeGen/NVPTX/managed.ll --- a/llvm/test/CodeGen/NVPTX/managed.ll +++ b/llvm/test/CodeGen/NVPTX/managed.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx40 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx40 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} ; RUN: not --crash llc < %s -march=nvptx -mcpu=sm_20 2>&1 | FileCheck %s --check-prefix ERROR ; ERROR: LLVM ERROR: .attribute(.managed) requires PTX version >= 4.0 and sm_30 diff --git a/llvm/test/CodeGen/NVPTX/match.ll b/llvm/test/CodeGen/NVPTX/match.ll --- a/llvm/test/CodeGen/NVPTX/match.ll +++ b/llvm/test/CodeGen/NVPTX/match.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} declare i32 @llvm.nvvm.match.any.sync.i32(i32, i32) declare i32 @llvm.nvvm.match.any.sync.i64(i32, i64) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify -arch=sm_53 %} declare half @llvm.nvvm.fma.rn.f16(half, half, half) declare half @llvm.nvvm.fma.rn.ftz.f16(half, half, half) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s +; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare i16 @llvm.nvvm.abs.bf16(i16) declare i32 @llvm.nvvm.abs.bf16x2(i32) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | FileCheck %s +; RUN: %if ptxas-11.2 %{ llc < %s -march=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | %ptxas-verify -arch=sm_86 %} declare half @llvm.nvvm.fmin.xorsign.abs.f16(half, half) declare half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half, half) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll --- a/llvm/test/CodeGen/NVPTX/math-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll @@ -1,6 +1,10 @@ ; RUN: llc < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOF16 ; RUN: llc < %s -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK,CHECK-F16 ; RUN: llc < %s -mcpu=sm_80 --nvptx-no-f16-math | FileCheck %s --check-prefixes=CHECK,CHECK-NOF16 +; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} +; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math | %ptxas-verify -arch=sm_80 %} + target triple = "nvptx64-nvidia-cuda" ; Checks that llvm intrinsics for math functions are correctly lowered to PTX. diff --git a/llvm/test/CodeGen/NVPTX/mbarrier.ll b/llvm/test/CodeGen/NVPTX/mbarrier.ll --- a/llvm/test/CodeGen/NVPTX/mbarrier.ll +++ b/llvm/test/CodeGen/NVPTX/mbarrier.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX32 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX64 +; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b) declare void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b) diff --git a/llvm/test/CodeGen/NVPTX/minmax-negative.ll b/llvm/test/CodeGen/NVPTX/minmax-negative.ll --- a/llvm/test/CodeGen/NVPTX/minmax-negative.ll +++ b/llvm/test/CodeGen/NVPTX/minmax-negative.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -O0 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -O0 | %ptxas-verify %} define i16 @test1(i16* %sur1) { ; CHECK-NOT: mov.u16 %rs{{[0-9]+}}, 32767 diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll --- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll +++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/module-inline-asm.ll b/llvm/test/CodeGen/NVPTX/module-inline-asm.ll --- a/llvm/test/CodeGen/NVPTX/module-inline-asm.ll +++ b/llvm/test/CodeGen/NVPTX/module-inline-asm.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/mulwide.ll b/llvm/test/CodeGen/NVPTX/mulwide.ll --- a/llvm/test/CodeGen/NVPTX/mulwide.ll +++ b/llvm/test/CodeGen/NVPTX/mulwide.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O3 | FileCheck %s --check-prefix=OPT ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s --check-prefix=NOOPT +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O3 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O0 | %ptxas-verify %} ; OPT-LABEL: @mulwide16 ; NOOPT-LABEL: @mulwide16 diff --git a/llvm/test/CodeGen/NVPTX/named-barriers.ll b/llvm/test/CodeGen/NVPTX/named-barriers.ll --- a/llvm/test/CodeGen/NVPTX/named-barriers.ll +++ b/llvm/test/CodeGen/NVPTX/named-barriers.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; Use bar.sync to arrive at a pre-computed barrier number and ; wait for all threads in CTA to also arrive: diff --git a/llvm/test/CodeGen/NVPTX/no-extra-parens.ll b/llvm/test/CodeGen/NVPTX/no-extra-parens.ll --- a/llvm/test/CodeGen/NVPTX/no-extra-parens.ll +++ b/llvm/test/CodeGen/NVPTX/no-extra-parens.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; ptxas has no special meaning for '$' character, so it should be used ; without parens. diff --git a/llvm/test/CodeGen/NVPTX/nofunc.ll b/llvm/test/CodeGen/NVPTX/nofunc.ll --- a/llvm/test/CodeGen/NVPTX/nofunc.ll +++ b/llvm/test/CodeGen/NVPTX/nofunc.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; Test that we don't crash if we're compiling a module with function references, ; but without any functions in it. diff --git a/llvm/test/CodeGen/NVPTX/nounroll.ll b/llvm/test/CodeGen/NVPTX/nounroll.ll --- a/llvm/test/CodeGen/NVPTX/nounroll.ll +++ b/llvm/test/CodeGen/NVPTX/nounroll.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-unknown-unknown" diff --git a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll --- a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll +++ b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target triple = "nvptx-unknown-nvcl" diff --git a/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll b/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll --- a/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown | %ptxas-verify %} ; ; NVPTXTargetLowering::getFunctionParamOptimizedAlign, which was introduces in ; D120129, contained a poorly designed assertion checking that a function with diff --git a/llvm/test/CodeGen/NVPTX/param-align.ll b/llvm/test/CodeGen/NVPTX/param-align.ll --- a/llvm/test/CodeGen/NVPTX/param-align.ll +++ b/llvm/test/CodeGen/NVPTX/param-align.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ;;; Need 4-byte alignment on float* passed byval define ptx_device void @t1(float* byval(float) %x) { diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll --- a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -1,5 +1,6 @@ ; Verifies correctness of load/store of parameters and return values. ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify -arch=sm_35 %} %s_i1 = type { i1 } %s_i8 = type { i8 } diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown | %ptxas-verify %} ; ; Check that parameters of a __device__ function with private or internal ; linkage called from a __global__ (kernel) function get increased alignment, diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll --- a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown | %ptxas-verify %} ; ; Check that parameters of a __global__ (kernel) function do not get increased ; alignment, and no additional vectorization is performed on loads/stores with diff --git a/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll b/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll --- a/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll +++ b/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s +; RUN: %if ptxas %{ llc -march=nvptx -verify-machineinstrs < %s | %ptxas-verify %} ; Tests the following pattern: ; (X & 8) != 0 --> (X & 8) >> 3 diff --git a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll --- a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll +++ b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define ptx_kernel void @t1(i1* %a) { ; PTX32: mov.u16 %rs{{[0-9]+}}, 0; diff --git a/llvm/test/CodeGen/NVPTX/pr16278.ll b/llvm/test/CodeGen/NVPTX/pr16278.ll --- a/llvm/test/CodeGen/NVPTX/pr16278.ll +++ b/llvm/test/CodeGen/NVPTX/pr16278.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} @one_f = addrspace(4) global float 1.000000e+00, align 4 diff --git a/llvm/test/CodeGen/NVPTX/pr17529.ll b/llvm/test/CodeGen/NVPTX/pr17529.ll --- a/llvm/test/CodeGen/NVPTX/pr17529.ll +++ b/llvm/test/CodeGen/NVPTX/pr17529.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll --- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll +++ b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} ; Check load from constant global variables. These loads should be ; ld.global.nc (aka ldg). diff --git a/llvm/test/CodeGen/NVPTX/redux-sync.ll b/llvm/test/CodeGen/NVPTX/redux-sync.ll --- a/llvm/test/CodeGen/NVPTX/redux-sync.ll +++ b/llvm/test/CodeGen/NVPTX/redux-sync.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s +; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare i32 @llvm.nvvm.redux.sync.umin(i32, i32) ; CHECK-LABEL: .func{{.*}}redux_sync_min_u32 diff --git a/llvm/test/CodeGen/NVPTX/refl1.ll b/llvm/test/CodeGen/NVPTX/refl1.ll --- a/llvm/test/CodeGen/NVPTX/refl1.ll +++ b/llvm/test/CodeGen/NVPTX/refl1.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target triple = "nvptx-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/reg-copy.ll b/llvm/test/CodeGen/NVPTX/reg-copy.ll --- a/llvm/test/CodeGen/NVPTX/reg-copy.ll +++ b/llvm/test/CodeGen/NVPTX/reg-copy.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-unknown-unknown" diff --git a/llvm/test/CodeGen/NVPTX/reg-types.ll b/llvm/test/CodeGen/NVPTX/reg-types.ll --- a/llvm/test/CodeGen/NVPTX/reg-types.ll +++ b/llvm/test/CodeGen/NVPTX/reg-types.ll @@ -3,6 +3,8 @@ ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=NO8BIT ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=NO8BIT +; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: .visible .func func() ; NO8BIT-LABEL: .visible .func func() diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll --- a/llvm/test/CodeGen/NVPTX/rotate.ll +++ b/llvm/test/CodeGen/NVPTX/rotate.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s ; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %} declare i32 @llvm.nvvm.rotate.b32(i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll --- a/llvm/test/CodeGen/NVPTX/rotate_64.ll +++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=nvptx | FileCheck %s - +; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %} declare i64 @llvm.nvvm.rotate.b64(i64, i32) declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) diff --git a/llvm/test/CodeGen/NVPTX/sched1.ll b/llvm/test/CodeGen/NVPTX/sched1.ll --- a/llvm/test/CodeGen/NVPTX/sched1.ll +++ b/llvm/test/CodeGen/NVPTX/sched1.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; Ensure source scheduling is working diff --git a/llvm/test/CodeGen/NVPTX/sched2.ll b/llvm/test/CodeGen/NVPTX/sched2.ll --- a/llvm/test/CodeGen/NVPTX/sched2.ll +++ b/llvm/test/CodeGen/NVPTX/sched2.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} define void @foo(<2 x i32>* %a) { ; CHECK: .func foo diff --git a/llvm/test/CodeGen/NVPTX/sext-in-reg.ll b/llvm/test/CodeGen/NVPTX/sext-in-reg.ll --- a/llvm/test/CodeGen/NVPTX/sext-in-reg.ll +++ b/llvm/test/CodeGen/NVPTX/sext-in-reg.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/sext-params.ll b/llvm/test/CodeGen/NVPTX/sext-params.ll --- a/llvm/test/CodeGen/NVPTX/sext-params.ll +++ b/llvm/test/CodeGen/NVPTX/sext-params.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/shfl-p.ll b/llvm/test/CodeGen/NVPTX/shfl-p.ll --- a/llvm/test/CodeGen/NVPTX/shfl-p.ll +++ b/llvm/test/CodeGen/NVPTX/shfl-p.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} declare {i32, i1} @llvm.nvvm.shfl.down.i32p(i32, i32, i32) declare {float, i1} @llvm.nvvm.shfl.down.f32p(float, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll --- a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll +++ b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} declare {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32) declare {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync.ll b/llvm/test/CodeGen/NVPTX/shfl-sync.ll --- a/llvm/test/CodeGen/NVPTX/shfl-sync.ll +++ b/llvm/test/CodeGen/NVPTX/shfl-sync.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) declare float @llvm.nvvm.shfl.sync.down.f32(float, i32, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/shfl.ll b/llvm/test/CodeGen/NVPTX/shfl.ll --- a/llvm/test/CodeGen/NVPTX/shfl.ll +++ b/llvm/test/CodeGen/NVPTX/shfl.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32) declare float @llvm.nvvm.shfl.down.f32(float, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/shift-parts.ll b/llvm/test/CodeGen/NVPTX/shift-parts.ll --- a/llvm/test/CodeGen/NVPTX/shift-parts.ll +++ b/llvm/test/CodeGen/NVPTX/shift-parts.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK: shift_parts_left_128 define void @shift_parts_left_128(i128* %val, i128* %amtptr) { diff --git a/llvm/test/CodeGen/NVPTX/simple-call.ll b/llvm/test/CodeGen/NVPTX/simple-call.ll --- a/llvm/test/CodeGen/NVPTX/simple-call.ll +++ b/llvm/test/CodeGen/NVPTX/simple-call.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s - - +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; CHECK: .func ({{.*}}) device_func define float @device_func(float %a) noinline { diff --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll --- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll +++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll @@ -1,5 +1,9 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \ ; RUN: | FileCheck %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \ +; RUN: | %ptxas-verify \ +; RUN: %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/st-addrspace.ll b/llvm/test/CodeGen/NVPTX/st-addrspace.ll --- a/llvm/test/CodeGen/NVPTX/st-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/st-addrspace.ll @@ -1,7 +1,9 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32 - +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %} ;; i8 ; ALL-LABEL: st_global_i8 diff --git a/llvm/test/CodeGen/NVPTX/st-generic.ll b/llvm/test/CodeGen/NVPTX/st-generic.ll --- a/llvm/test/CodeGen/NVPTX/st-generic.ll +++ b/llvm/test/CodeGen/NVPTX/st-generic.ll @@ -1,6 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 - +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; i8 diff --git a/llvm/test/CodeGen/NVPTX/store-retval.ll b/llvm/test/CodeGen/NVPTX/store-retval.ll --- a/llvm/test/CodeGen/NVPTX/store-retval.ll +++ b/llvm/test/CodeGen/NVPTX/store-retval.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s --mtriple=nvptx-unknown-unknown | FileCheck %s +; RUN: %if ptxas %{ llc < %s --mtriple=nvptx-unknown-unknown | %ptxas-verify %} ; ; This is IR generated with clang using -O3 optimization level ; and nvptx-unknown-unknown target from the following C code. diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll --- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20 ; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} target triple = "nvptx-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/surf-read.ll b/llvm/test/CodeGen/NVPTX/surf-read.ll --- a/llvm/test/CodeGen/NVPTX/surf-read.ll +++ b/llvm/test/CodeGen/NVPTX/surf-read.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} target triple = "nvptx-unknown-nvcl" diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py --- a/llvm/test/CodeGen/NVPTX/surf-tex.py +++ b/llvm/test/CodeGen/NVPTX/surf-tex.py @@ -1,11 +1,13 @@ # RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll # RUN: llc %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll +# RUN: %if ptxas %{ llc %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify %} # We only need to run this second time for texture tests, because # there is a difference between unified and non-unified intrinsics. # # RUN: %python %s --target=nvcl --tests=suld,sust,tex,tld4 --gen-list-append --gen-list=%t.list > %t-nvcl.ll # RUN: llc %t-nvcl.ll -verify-machineinstrs -o - | FileCheck %t-nvcl.ll +# RUN: %if ptxas %{ llc %t-nvcl.ll -verify-machineinstrs -o - | %ptxas-verify %} # Verify that all instructions and intrinsics defined in TableGen # files are tested. The command may fail if the files are changed diff --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll --- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20 ; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} target triple = "nvptx-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/surf-write.ll b/llvm/test/CodeGen/NVPTX/surf-write.ll --- a/llvm/test/CodeGen/NVPTX/surf-write.ll +++ b/llvm/test/CodeGen/NVPTX/surf-write.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} target triple = "nvptx-unknown-nvcl" diff --git a/llvm/test/CodeGen/NVPTX/symbol-naming.ll b/llvm/test/CodeGen/NVPTX/symbol-naming.ll --- a/llvm/test/CodeGen/NVPTX/symbol-naming.ll +++ b/llvm/test/CodeGen/NVPTX/symbol-naming.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; Verify that the NVPTX target removes invalid symbol names prior to emitting ; PTX. diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll --- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20 ; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} target triple = "nvptx-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/tex-read.ll b/llvm/test/CodeGen/NVPTX/tex-read.ll --- a/llvm/test/CodeGen/NVPTX/tex-read.ll +++ b/llvm/test/CodeGen/NVPTX/tex-read.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} target triple = "nvptx-unknown-nvcl" diff --git a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll --- a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll +++ b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20 ; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} target triple = "nvptx-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/tid-range.ll b/llvm/test/CodeGen/NVPTX/tid-range.ll --- a/llvm/test/CodeGen/NVPTX/tid-range.ll +++ b/llvm/test/CodeGen/NVPTX/tid-range.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s -march=nvptx64 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %} + declare i32 @get_register() define i1 @test1() { diff --git a/llvm/test/CodeGen/NVPTX/tuple-literal.ll b/llvm/test/CodeGen/NVPTX/tuple-literal.ll --- a/llvm/test/CodeGen/NVPTX/tuple-literal.ll +++ b/llvm/test/CodeGen/NVPTX/tuple-literal.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 +; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %} define ptx_device void @test_function({i8, i8}*) { ret void diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll --- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll +++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/vec8.ll b/llvm/test/CodeGen/NVPTX/vec8.ll --- a/llvm/test/CodeGen/NVPTX/vec8.ll +++ b/llvm/test/CodeGen/NVPTX/vec8.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} target triple = "nvptx-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/vector-args.ll b/llvm/test/CodeGen/NVPTX/vector-args.ll --- a/llvm/test/CodeGen/NVPTX/vector-args.ll +++ b/llvm/test/CodeGen/NVPTX/vector-args.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} define float @foo(<2 x float> %a) { ; CHECK: .func (.param .b32 func_retval0) foo diff --git a/llvm/test/CodeGen/NVPTX/vector-call.ll b/llvm/test/CodeGen/NVPTX/vector-call.ll --- a/llvm/test/CodeGen/NVPTX/vector-call.ll +++ b/llvm/test/CodeGen/NVPTX/vector-call.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} target triple = "nvptx-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/vector-compare.ll b/llvm/test/CodeGen/NVPTX/vector-compare.ll --- a/llvm/test/CodeGen/NVPTX/vector-compare.ll +++ b/llvm/test/CodeGen/NVPTX/vector-compare.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 +; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %} ; This test makes sure that the result of vector compares are properly ; scalarized. If codegen fails, then the type legalizer incorrectly diff --git a/llvm/test/CodeGen/NVPTX/vector-global.ll b/llvm/test/CodeGen/NVPTX/vector-global.ll --- a/llvm/test/CodeGen/NVPTX/vector-global.ll +++ b/llvm/test/CodeGen/NVPTX/vector-global.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll --- a/llvm/test/CodeGen/NVPTX/vector-loads.ll +++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; Even though general vector types are not supported in PTX, we can still ; optimize loads/stores with pseudo-vector instructions of the form: diff --git a/llvm/test/CodeGen/NVPTX/vector-select.ll b/llvm/test/CodeGen/NVPTX/vector-select.ll --- a/llvm/test/CodeGen/NVPTX/vector-select.ll +++ b/llvm/test/CodeGen/NVPTX/vector-select.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 +; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %} ; This test makes sure that vector selects are scalarized by the type legalizer. ; If not, type legalization will fail. diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll --- a/llvm/test/CodeGen/NVPTX/vector-stores.ll +++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK: .visible .func foo1 ; CHECK: st.v2.f32 diff --git a/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll b/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll --- a/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll +++ b/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s | FileCheck %s +; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} + target triple = "nvptx64-nvidia-cuda" ; CHECK-LABEL: test1 diff --git a/llvm/test/CodeGen/NVPTX/vote.ll b/llvm/test/CodeGen/NVPTX/vote.ll --- a/llvm/test/CodeGen/NVPTX/vote.ll +++ b/llvm/test/CodeGen/NVPTX/vote.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} declare i1 @llvm.nvvm.vote.all(i1) ; CHECK-LABEL: .func{{.*}}vote_all diff --git a/llvm/test/CodeGen/NVPTX/weak-global.ll b/llvm/test/CodeGen/NVPTX/weak-global.ll --- a/llvm/test/CodeGen/NVPTX/weak-global.ll +++ b/llvm/test/CodeGen/NVPTX/weak-global.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK: .weak .global .align 4 .u32 g @g = common addrspace(1) global i32 zeroinitializer diff --git a/llvm/test/CodeGen/NVPTX/weak-linkage.ll b/llvm/test/CodeGen/NVPTX/weak-linkage.ll --- a/llvm/test/CodeGen/NVPTX/weak-linkage.ll +++ b/llvm/test/CodeGen/NVPTX/weak-linkage.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; CHECK: // .weak foo ; CHECK: .weak .func foo diff --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py --- a/llvm/test/CodeGen/NVPTX/wmma.py +++ b/llvm/test/CodeGen/NVPTX/wmma.py @@ -9,6 +9,10 @@ # RUN: --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX # RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \ # RUN: | FileCheck %t-ptx60-sm_70.ll +# RUN: %if ptxas %{ \ +# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \ +# RUN: | %ptxas-verify -arch=sm_70 \ +# RUN: %} # Check all variants of instructions supported by PTX61 on SM70 # RUN: %python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll @@ -18,6 +22,10 @@ # RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX # RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \ # RUN: | FileCheck %t-ptx61-sm_70.ll +# RUN: %if ptxas-9.1 %{ \ +# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \ +# RUN: | %ptxas-verify -arch=sm_70 \ +# RUN: %} # Check all variants of instructions supported by PTX63 on SM72 # RUN: %python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll @@ -27,6 +35,10 @@ # RUN: --check-prefixes=INTRINSICS,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX # RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \ # RUN: | FileCheck %t-ptx63-sm_72.ll +# RUN: %if ptxas-10.0 %{ \ +# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \ +# RUN: | %ptxas-verify -arch=sm_72 \ +# RUN: %} # Check all variants of instructions supported by PTX63 on SM75 # RUN: %python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll @@ -36,6 +48,10 @@ # RUN: --check-prefixes=INTRINSICS,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX # RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \ # RUN: | FileCheck %t-ptx63-sm_75.ll +# RUN: %if ptxas-10.0 %{ \ +# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \ +# RUN: | %ptxas-verify -arch=sm_75 \ +# RUN: %} # Check all variants of instructions supported by PTX64 on SM70+ # RUN: %python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll @@ -45,6 +61,10 @@ # RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NODOUBLE,NOALTFLOAT,NOLDMATRIX # RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \ # RUN: | FileCheck %t-ptx64-sm_70.ll +# RUN: %if ptxas-10.1 %{ \ +# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \ +# RUN: | %ptxas-verify -arch=sm_70 \ +# RUN: %} # Check all variants of instructions supported by PTX65 on SM75+ # RUN: %python %s --ptx=65 --gpu-arch=75 > %t-ptx65-sm_75.ll @@ -54,6 +74,10 @@ # RUN: --check-prefixes=INTRINSICS # RUN: llc < %t-ptx65-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \ # RUN: | FileCheck %t-ptx65-sm_75.ll +# RUN: %if ptxas-10.2 %{ \ +# RUN: llc < %t-ptx65-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \ +# RUN: | %ptxas-verify -arch=sm_75 \ +# RUN: %} # Check all variants of instructions supported by PTX71 on SM80+ # RUN: %python %s --ptx=71 --gpu-arch=80 > %t-ptx71-sm_80.ll @@ -63,6 +87,10 @@ # RUN: --check-prefixes=INTRINSICS # RUN: llc < %t-ptx71-sm_80.ll -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \ # RUN: | FileCheck %t-ptx71-sm_80.ll +# RUN: %if ptxas-11.1 %{ \ +# RUN: llc < %t-ptx71-sm_80.ll -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \ +# RUN: | %ptxas-verify -arch=sm_80 \ +# RUN: %} from __future__ import print_function diff --git a/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll b/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll --- a/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll +++ b/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %} ; The zeroext attribute below should be silently ignored because ; we can pass a 32-bit integer across a function call without diff --git a/llvm/test/DebugInfo/NVPTX/crash-missing-DISubprogram.ll b/llvm/test/DebugInfo/NVPTX/crash-missing-DISubprogram.ll --- a/llvm/test/DebugInfo/NVPTX/crash-missing-DISubprogram.ll +++ b/llvm/test/DebugInfo/NVPTX/crash-missing-DISubprogram.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda %if ptxas %{ | %ptxas-verify %} + ; Don't crash for a function w/o debug info that contains an instruction w/ ; debug info. ; Reported as #51079 diff --git a/llvm/test/DebugInfo/NVPTX/cu-range-hole.ll b/llvm/test/DebugInfo/NVPTX/cu-range-hole.ll --- a/llvm/test/DebugInfo/NVPTX/cu-range-hole.ll +++ b/llvm/test/DebugInfo/NVPTX/cu-range-hole.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %} ; CHECK: .target sm_{{[0-9]+}}, debug diff --git a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll --- a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll +++ b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %} ; CHECK: .target sm_20, debug diff --git a/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll b/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll --- a/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll +++ b/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll @@ -1,4 +1,6 @@ ; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s | FileCheck %s +; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda < %s | %ptxas-verify %} + ; Generated with -O1 from: ; int f1(); ; void f2(int*); diff --git a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll --- a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s | FileCheck %s +; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda < %s | %ptxas-verify %} @GLOBAL = addrspace(1) externally_initialized global i32 0, align 4, !dbg !0 @SHARED = addrspace(3) externally_initialized global i32 undef, align 4, !dbg !6 diff --git a/llvm/test/DebugInfo/NVPTX/debug-empty.ll b/llvm/test/DebugInfo/NVPTX/debug-empty.ll --- a/llvm/test/DebugInfo/NVPTX/debug-empty.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-empty.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %} ; CHECK: .target sm_{{[0-9]+$}} ; CHECK: .section .debug_loc { } diff --git a/llvm/test/DebugInfo/NVPTX/debug-file-loc-only.ll b/llvm/test/DebugInfo/NVPTX/debug-file-loc-only.ll --- a/llvm/test/DebugInfo/NVPTX/debug-file-loc-only.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-file-loc-only.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %} ; // Bitcode in this test case is reduced version of compiled code below: ;extern "C" { diff --git a/llvm/test/DebugInfo/NVPTX/debug-file-loc.ll b/llvm/test/DebugInfo/NVPTX/debug-file-loc.ll --- a/llvm/test/DebugInfo/NVPTX/debug-file-loc.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-file-loc.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %} ; // Bitcode int this test case is reduced version of compiled code below: ;extern "C" { diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll --- a/llvm/test/DebugInfo/NVPTX/debug-info.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %} ; // Bitcode int this test case is reduced version of compiled code below: ;__device__ inline void res(float x, float y, float *res) { *res = x + y; } diff --git a/llvm/test/DebugInfo/NVPTX/debug-loc-offset.ll b/llvm/test/DebugInfo/NVPTX/debug-loc-offset.ll --- a/llvm/test/DebugInfo/NVPTX/debug-loc-offset.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-loc-offset.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s | FileCheck %s +; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda < %s | %ptxas-verify %} ; CHECK: .target sm_{{[0-9]+}}, debug diff --git a/llvm/test/DebugInfo/NVPTX/debug-name-table.ll b/llvm/test/DebugInfo/NVPTX/debug-name-table.ll --- a/llvm/test/DebugInfo/NVPTX/debug-name-table.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-name-table.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=nvptx64-nvidia-cuda -mattr=+ptx75 < %s | FileCheck %s +; RUN: %if ptxas-11.5 %{ llc -mtriple=nvptx64-nvidia-cuda -mattr=+ptx75 < %s | %ptxas-verify %} ; DICompileUnit without 'nameTableKind: None' results in ; debug_pubnames and debug_pubtypes sections in DWARF. These sections diff --git a/llvm/test/DebugInfo/NVPTX/dwarf-file-dir.ll b/llvm/test/DebugInfo/NVPTX/dwarf-file-dir.ll --- a/llvm/test/DebugInfo/NVPTX/dwarf-file-dir.ll +++ b/llvm/test/DebugInfo/NVPTX/dwarf-file-dir.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck --check-prefix=CHECK-NODIR %s ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -dwarf-directory=1 | FileCheck --check-prefix=CHECK-DIR %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %} ; CHECK-NODIR: .file {{[0-9]+}} "/tmp/dbginfo/a{{/|\\\\}}a.cpp" ; @@ -15,7 +16,7 @@ !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!8, !9} -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2, nameTableKind: None) !1 = !DIFile(filename: "a.cpp", directory: "/tmp/dbginfo/a") !2 = !{} !4 = distinct !DISubprogram(name: "func", linkageName: "_Z4funcv", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !1, type: !6, retainedNodes: !2) diff --git a/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll b/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll --- a/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll +++ b/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %} ; Produced at -O0 from: ; struct { diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -192,6 +192,53 @@ ToolSubst('OrcV2CBindingsVeryLazy', unresolved='ignore'), ToolSubst('dxil-dis', unresolved='ignore')]) +# Find (major, minor) version of ptxas +def ptxas_version(ptxas): + ptxas_cmd = subprocess.Popen([ptxas, '--version'], stdout=subprocess.PIPE) + ptxas_out = ptxas_cmd.stdout.read().decode('ascii') + ptxas_cmd.wait() + match = re.search('release (\d+)\.(\d+)', ptxas_out) + if match: + return (int(match.group(1)), int(match.group(2))) + print('couldn\'t determine ptxas version') + return None + +def enable_ptxas(ptxas_executable): + version = ptxas_version(ptxas_executable) + if version: + # ptxas is supposed to be backward compatible with previous + # versions, so add a feature for every known version prior to + # the current one. + ptxas_known_versions = [ + (9, 0), (9, 1), (9, 2), + (10, 0), (10, 1), (10, 2), + (11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6), + ] + + # ignore ptxas if its version is below the minimum supported + # version + min_version = ptxas_known_versions[0] + if version[0] < min_version[0] or version[1] < min_version[1]: + print( + 'Warning: ptxas version {}.{} is not supported'.format( + version[0], version[1])) + return + + for known_major, known_minor in ptxas_known_versions: + if known_major <= version[0] and known_minor <= version[1]: + config.available_features.add( + 'ptxas-{}.{}'.format(known_major, known_minor)) + + config.available_features.add('ptxas') + tools.extend([ToolSubst('%ptxas', ptxas_executable), + ToolSubst('%ptxas-verify', '{} -c -o /dev/null -'.format( + ptxas_executable))]) + +ptxas_executable = \ + os.environ.get('LLVM_PTXAS_EXECUTABLE', None) or config.ptxas_executable +if ptxas_executable: + enable_ptxas(ptxas_executable) + llvm_config.add_tool_substitutions(tools, config.llvm_tools_dir) # Targets diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -23,6 +23,7 @@ config.ocaml_flags = "@OCAMLFLAGS@" config.include_go_tests = @LLVM_INCLUDE_GO_TESTS@ config.go_executable = "@GO_EXECUTABLE@" +config.ptxas_executable = "@PXTAS_EXECUTABLE@" config.enable_shared = @ENABLE_SHARED@ config.enable_assertions = @ENABLE_ASSERTIONS@ config.targets_to_build = "@TARGETS_TO_BUILD@"