diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -7081,7 +7081,11 @@ return ABIArgInfo::getDirect( CGInfo.getCUDADeviceBuiltinTextureDeviceType()); } - return getNaturalAlignIndirect(Ty, /* byval */ true); + return ABIArgInfo::getIndirectAliased( + getContext().getTypeAlignInChars(Ty), + getContext().getTargetAddressSpace( + getLangASFromTargetAS(/*ADDRESS_SPACE_PARAM*/ 101)), + false /*Realign*/, nullptr /*Padding*/); } if (const auto *EIT = Ty->getAs()) { diff --git a/clang/test/CodeGen/nvptx-abi.c b/clang/test/CodeGen/nvptx-abi.c --- a/clang/test/CodeGen/nvptx-abi.c +++ b/clang/test/CodeGen/nvptx-abi.c @@ -21,14 +21,14 @@ void foo(float4_t x) { // CHECK-LABEL: @foo -// CHECK: %struct.float4_s* byval(%struct.float4_s) align 4 %x +// CHECK: %struct.float4_s addrspace(101)* byref(%struct.float4_s) align 4 %0 } void fooN(float4_t x, float4_t y, float4_t z) { // CHECK-LABEL: @fooN -// CHECK: %struct.float4_s* byval(%struct.float4_s) align 4 %x -// CHECK: %struct.float4_s* byval(%struct.float4_s) align 4 %y -// CHECK: %struct.float4_s* byval(%struct.float4_s) align 4 %z +// CHECK: %struct.float4_s addrspace(101)* byref(%struct.float4_s) align 4 %0 +// CHECK: %struct.float4_s addrspace(101)* byref(%struct.float4_s) align 4 %1 +// CHECK: %struct.float4_s addrspace(101)* byref(%struct.float4_s) align 4 %2 } typedef struct nested_s { @@ -39,5 +39,5 @@ void baz(nested_t x) { // CHECK-LABEL: @baz -// CHECK: %struct.nested_s* byval(%struct.nested_s) align 8 %x) +// CHECK: %struct.nested_s addrspace(101)* byref(%struct.nested_s) align 8 %0) } diff --git a/clang/test/CodeGenCUDA/kernel-args-alignment.cu b/clang/test/CodeGenCUDA/kernel-args-alignment.cu --- a/clang/test/CodeGenCUDA/kernel-args-alignment.cu +++ b/clang/test/CodeGenCUDA/kernel-args-alignment.cu @@ -36,5 +36,5 @@ // HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24) // DEVICE-LABEL: @_Z6kernelc1SPi -// DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval(%struct.S) align 8{{[^,]*}}, i32* +// DEVICE-SAME: i8{{[^,]*}}, %struct.S addrspace(101)* byref(%struct.S) align 8{{[^,]*}}, i32* __global__ void kernel(char a, S s, int *b) {} diff --git a/clang/test/CodeGenCUDA/kernel-args.cu b/clang/test/CodeGenCUDA/kernel-args.cu --- a/clang/test/CodeGenCUDA/kernel-args.cu +++ b/clang/test/CodeGenCUDA/kernel-args.cu @@ -10,14 +10,14 @@ }; // AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}}) -// NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) align 8 %x) +// NVPTX: define void @_Z6kernel1A(%struct.A addrspace(101)* byref(%struct.A) align 8 %0) __global__ void kernel(A x) { } class Kernel { public: // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}}) - // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 8 %x) + // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(101)* byref(%struct.A) align 8 %0) static __global__ void memberKernel(A x){} template static __global__ void templateMemberKernel(T x) {} }; @@ -31,10 +31,10 @@ void test() { Kernel K; // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}} - // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 8 %x) + // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A addrspace(101)* byref(%struct.A) align 8 %0) launch((void*)templateKernel); // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}} - // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 8 %x) + // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(101)* byref(%struct.A) align 8 %0) launch((void*)Kernel::templateMemberKernel); } diff --git a/clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp b/clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp --- a/clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp +++ b/clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp @@ -34,7 +34,7 @@ #pragma omp declare target T a = T(); T f = a; -// CHECK: define{{ hidden | }}void @{{.+}}foo{{.+}}([[T]]* byval([[T]]) align {{.+}}) +// CHECK: define{{ hidden | }}void @{{.+}}foo{{.+}}([[T]] addrspace(101)* byref([[T]]) align {{.+}}) void foo(T a = T()) { return; } @@ -54,7 +54,7 @@ } T1 a1 = T1(); T1 f1 = a1; -// CHECK: define{{ hidden | }}void @{{.+}}foo1{{.+}}([[T1]]* byval([[T1]]) align {{.+}}) +// CHECK: define{{ hidden | }}void @{{.+}}foo1{{.+}}([[T1]] addrspace(101)* byref([[T1]]) align {{.+}}) void foo1(T1 a = T1()) { return; } diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt --- a/llvm/lib/Target/NVPTX/CMakeLists.txt +++ b/llvm/lib/Target/NVPTX/CMakeLists.txt @@ -11,6 +11,7 @@ add_public_tablegen_target(NVPTXCommonTableGen) set(NVPTXCodeGen_sources + NVPTXAA.cpp NVPTXAllocaHoisting.cpp NVPTXAsmPrinter.cpp NVPTXAssignValidGlobalNames.cpp diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -46,6 +46,8 @@ FunctionPass *createNVPTXLowerAllocaPass(); MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); +ImmutablePass *createNVPTXAAWrapperPass(); +ImmutablePass *createNVPTXExternalAAWrapperPass(); namespace NVPTX { enum DrvInterface { diff --git a/llvm/lib/Target/NVPTX/NVPTXAA.cpp b/llvm/lib/Target/NVPTX/NVPTXAA.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/NVPTX/NVPTXAA.cpp @@ -0,0 +1,131 @@ +#include "MCTargetDesc/NVPTXBaseInfo.h" +#include "NVPTX.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Module.h" + +namespace llvm { +void initializeNVPTXAAWrapperPass(PassRegistry &); +void initializeNVPTXExternalAAWrapperPass(PassRegistry &); +} // namespace llvm + +#define DEBUG_TYPE "nvptx-aa" + +using namespace llvm; + +namespace { + +class NVPTXAAResult : public AAResultBase { + friend AAResultBase; + +public: + explicit NVPTXAAResult() : AAResultBase() {} + NVPTXAAResult(NVPTXAAResult &&Arg) : AAResultBase(std::move(Arg)) {} + + bool invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &Inv); + + AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB, + AAQueryInfo &AAQI) { + MemoryLocation L1 = LocA; + MemoryLocation L2 = LocB; + unsigned AS1 = L1.Ptr->getType()->getPointerAddressSpace(); + unsigned AS2 = L2.Ptr->getType()->getPointerAddressSpace(); + if (AS1 != ADDRESS_SPACE_GENERIC) { + std::swap(L1, L2); + std::swap(AS1, AS2); + } + if (AS1 == ADDRESS_SPACE_GENERIC) { + const auto *O1 = + getUnderlyingObject(L1.Ptr->stripPointerCastsAndInvariantGroups()); + AS1 = O1->getType()->getPointerAddressSpace(); + if (AS2 == ADDRESS_SPACE_GENERIC) { + const auto *O2 = + getUnderlyingObject(L1.Ptr->stripPointerCastsAndInvariantGroups()); + AS2 = O2->getType()->getPointerAddressSpace(); + } + if (AS1 == ADDRESS_SPACE_PARAM || AS2 == ADDRESS_SPACE_PARAM) { + if (AS1 != AS2) + return NoAlias; + // Fallback to the next alias analysis. + } else if (AS1 != ADDRESS_SPACE_GENERIC && AS2 != ADDRESS_SPACE_GENERIC) { + if (AS1 != AS2) + return NoAlias; + // Fallback to the next alias analysis. + } + } else if (AS1 != AS2) { + return NoAlias; + // Fallback to the next alias analysis. + } + // Query the next alias analysis. + return AAResultBase::alias(LocA, LocB, AAQI); + } + + bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI, + bool OrLocal) { + unsigned AS = Loc.Ptr->getType()->getPointerAddressSpace(); + // According to PTX ISA section 5.1.6.4, ``Function input parameters may be + // read via `ld.param` and function return parameters may be written using + // `st.param`; it is illegal to write to an input parameter or read from a + // return parameter.'' It's safe to assume that parameter memory space is + // constant. + if (AS == ADDRESS_SPACE_CONST || AS == ADDRESS_SPACE_PARAM) + return true; + return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal); + } +}; + +class NVPTXAAWrapper : public ImmutablePass { + std::unique_ptr Result; + +public: + static char ID; + + NVPTXAAWrapper() : ImmutablePass(ID) { + initializeNVPTXAAWrapperPass(*PassRegistry::getPassRegistry()); + } + + NVPTXAAResult &getResult() { return *Result; } + const NVPTXAAResult &getResult() const { return *Result; } + + bool doInitialization(Module &M) override { + Result.reset(new NVPTXAAResult()); + return false; + } + + bool doFinalization(Module &M) override { + Result.reset(); + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } +}; + +class NVPTXExternalAAWrapper : public ExternalAAWrapperPass { +public: + static char ID; + + NVPTXExternalAAWrapper() + : ExternalAAWrapperPass([](Pass &P, Function &F, AAResults &AAR) { + if (auto *WrapperPass = P.getAnalysisIfAvailable()) + AAR.addAAResult(WrapperPass->getResult()); + }) {} +}; + +} // End of anonymous namespace + +char NVPTXAAWrapper::ID = 0; +char NVPTXExternalAAWrapper::ID = 0; + +INITIALIZE_PASS(NVPTXAAWrapper, DEBUG_TYPE, "NVPTX AA Wrapper", true, true) +INITIALIZE_PASS(NVPTXExternalAAWrapper, "nvptx-external-aa-wrapper", + "NVPTX ExternalAA Wrapper", true, true) + +ImmutablePass *llvm::createNVPTXAAWrapperPass() { return new NVPTXAAWrapper(); } + +ImmutablePass *llvm::createNVPTXExternalAAWrapperPass() { + return new NVPTXExternalAAWrapper(); +} diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1457,7 +1457,8 @@ } } - if (!PAL.hasParamAttribute(paramIndex, Attribute::ByVal)) { + if (!PAL.hasParamAttribute(paramIndex, Attribute::ByVal) && + !PAL.hasParamAttribute(paramIndex, Attribute::ByRef)) { if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { // Just print .param .align .b8 .param[size]; // = PAL.getparamalignment diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2531,7 +2531,8 @@ // to newly created nodes. The SDNodes for params have to // appear in the same order as their order of appearance // in the original function. "idx+1" holds that order. - if (!PAL.hasParamAttribute(i, Attribute::ByVal)) { + if (!PAL.hasParamAttribute(i, Attribute::ByVal) && + !PAL.hasParamAttribute(i, Attribute::ByRef)) { bool aggregateIsPacked = false; if (StructType *STy = dyn_cast(Ty)) aggregateIsPacked = STy->isPacked(); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -198,11 +198,19 @@ void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { Builder.addExtension( - PassManagerBuilder::EP_EarlyAsPossible, - [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) { - PM.add(createNVVMReflectPass(Subtarget.getSmVersion())); - PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion())); - }); + PassManagerBuilder::EP_EarlyAsPossible, + [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + PM.add(createNVPTXAAWrapperPass()); + PM.add(createNVPTXExternalAAWrapperPass()); + PM.add(createNVVMReflectPass(Subtarget.getSmVersion())); + PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion())); + }); + Builder.addExtension( + PassManagerBuilder::EP_ModuleOptimizerEarly, + [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + PM.add(createNVPTXAAWrapperPass()); + PM.add(createNVPTXExternalAAWrapperPass()); + }); } TargetTransformInfo @@ -279,6 +287,9 @@ addStraightLineScalarOptimizationPasses(); } + addPass(createNVPTXAAWrapperPass()); + addPass(createNVPTXExternalAAWrapperPass()); + // === LSR and other generic IR passes === TargetPassConfig::addIRPasses(); // EarlyCSE is not always strong enough to clean up what LSR produces. For