Index: llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h =================================================================== --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -61,26 +61,31 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { class AggBuffer { - // Used to buffer the emitted string for initializing global - // aggregates. + // Used to buffer the emitted string for initializing global aggregates. // - // Normally an aggregate (array, vector or structure) is emitted - // as a u8[]. However, if one element/field of the aggregate - // is a non-NULL address, then the aggregate is emitted as u32[] - // or u64[]. + // Normally an aggregate (array, vector, or structure) is emitted as a u8[]. + // However, if any element/field of the aggregate is a non-NULL address, + // and all such addresses are properly aligned, then the aggregate is + // emitted as u32[] or u64[]. In the case of unaligned addresses, the + // aggregate is emitted as u8[], and the mask() operator is used for all + // pointers. // - // We first layout the aggregate in 'buffer' in bytes, except for - // those symbol addresses. For the i-th symbol address in the - //aggregate, its corresponding 4-byte or 8-byte elements in 'buffer' - // are filled with 0s. symbolPosInBuffer[i-1] records its position - // in 'buffer', and Symbols[i-1] records the Value*. + // We first lay out the aggregate in 'buffer' in bytes, except for those + // symbol addresses. For the i-th symbol address in the aggregate, its + // corresponding 4-byte or 8-byte elements in 'buffer' are filled with 0s. + // symbolPosInBuffer[i-1] records its position in 'buffer', and Symbols[i-1] + // records the Value*. // - // Once we have this AggBuffer setup, we can choose how to print - // it out. + // Once we have this AggBuffer set up, we can choose how to print it out. 
public: // number of symbol addresses unsigned numSymbols() const { return Symbols.size(); } + bool allSymbolsAligned(unsigned ptrSize) const { + return std::all_of(symbolPosInBuffer.begin(), symbolPosInBuffer.end(), + [=](unsigned pos) { return pos % ptrSize == 0; }); + } + private: const unsigned size; // size of the buffer in bytes std::vector buffer; // the buffer Index: llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -75,6 +75,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" +#include "llvm/Support/NativeFormatting.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -1172,11 +1173,26 @@ bufferAggregateConstant(Initializer, &aggBuffer); if (aggBuffer.numSymbols()) { unsigned int ptrSize = MAI->getCodePointerSize(); - O << " .u" << ptrSize * 8 << " "; - getSymbol(GVar)->print(O, MAI); - O << "[" << ElementSize / ptrSize << "] = {"; - aggBuffer.printWords(O); - O << "}"; + if (ElementSize % ptrSize || + !aggBuffer.allSymbolsAligned(ptrSize)) { + // Print in bytes and use the mask() operator for pointers. 
+ if (!STI.hasMaskOperator()) + report_fatal_error( + "initialized packed aggregate with pointers '" + + GVar->getName() + + "' requires at least PTX ISA version 7.1"); + O << " .u8 "; + getSymbol(GVar)->print(O, MAI); + O << "[" << ElementSize << "] = {"; + aggBuffer.printBytes(O); + O << "}"; + } else { + O << " .u" << ptrSize * 8 << " "; + getSymbol(GVar)->print(O, MAI); + O << "[" << ElementSize / ptrSize << "] = {"; + aggBuffer.printWords(O); + O << "}"; + } } else { O << " .b8 "; getSymbol(GVar)->print(O, MAI); @@ -1233,10 +1249,32 @@ } void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) { + unsigned int ptrSize = AP.MAI->getCodePointerSize(); + symbolPosInBuffer.push_back(size); + unsigned int nSym = 0; + unsigned int nextSymbolPos = symbolPosInBuffer[nSym]; for (unsigned int pos = 0; pos < size; ++pos) { if (pos) os << ", "; - os << (unsigned int)buffer[pos]; + if (pos != nextSymbolPos) { + os << (unsigned int)buffer[pos]; + continue; + } + // Generate a per-byte mask() operator for the symbol, which looks like: + // .global .u8 addr[] = {0xFF(foo), 0xFF00(foo), 0xFF0000(foo), ...}; + // See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#initializers + std::string symText; + llvm::raw_string_ostream oss(symText); + printSymbol(nSym, oss); + for (unsigned i = 0; i < ptrSize; ++i) { + if (i) + os << ", "; + llvm::write_hex(os, 0xFFULL << i * 8, HexPrintStyle::PrefixUpper); + os << "(" << symText << ")"; + } + pos += ptrSize - 1; + nextSymbolPos = symbolPosInBuffer[++nSym]; + assert(pos < nextSymbolPos); } } Index: llvm/lib/Target/NVPTX/NVPTXSubtarget.h =================================================================== --- llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -77,6 +77,7 @@ bool hasImageHandles() const; bool hasFP16Math() const { return SmVersion >= 53; } bool allowFP16Math() const; + bool hasMaskOperator() const { return PTXVersion >= 71; } unsigned int getSmVersion() const { return 
SmVersion; } std::string getTargetName() const { return TargetName; } Index: llvm/test/CodeGen/NVPTX/packed-aggr.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/NVPTX/packed-aggr.ll @@ -0,0 +1,95 @@ +; RUN: not --crash llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx70 2>&1 | \ +; RUN: FileCheck %s --check-prefix=ERR +; ERR: initialized packed aggregate with pointers 's1' requires at least PTX ISA version 7.1 + +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx71 | \ +; RUN: FileCheck %s --check-prefixes=CHECK,CHECK32 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | \ +; RUN: FileCheck %s --check-prefixes=CHECK,CHECK64 +; RUN: %if ptxas-11.1 %{ llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} +; RUN: %if ptxas-11.1 %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} + +;; Test that packed structs with symbol references are represented using the +;; mask() operator. + +declare void @func() +@p = addrspace(1) global i8 0 +; CHECK: .extern .func func +; CHECK: .u8 p; + +%t1 = type <{ i16, i8*, i8, void ()*, i8*, i32 }> +@s1 = addrspace(1) global %t1 <{ +; CHECK32: .global .align 1 .u8 s1[19] = { +; CHECK64: .global .align 1 .u8 s1[31] = { + i16 12, +; CHECK-SAME: 12, 0, + i8* addrspacecast (i8 addrspace(1)* @p to i8*), +; CHECK-SAME: 0xFF(generic(p)), 0xFF00(generic(p)), 0xFF0000(generic(p)), 0xFF000000(generic(p)), +; CHECK64-SAME: 0xFF00000000(generic(p)), 0xFF0000000000(generic(p)), 0xFF000000000000(generic(p)), 0xFF00000000000000(generic(p)), + i8 34, +; CHECK-SAME: 34 + void ()* @func, +; CHECK-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), +; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func), + i8* addrspacecast (i8 addrspace(1)* getelementptr (i8, i8 addrspace(1)* @p, i32 3) to i8*), +; CHECK-SAME: 0xFF(generic(p)+3), 0xFF00(generic(p)+3), 0xFF0000(generic(p)+3), 
0xFF000000(generic(p)+3), +; CHECK64-SAME: 0xFF00000000(generic(p)+3), 0xFF0000000000(generic(p)+3), 0xFF000000000000(generic(p)+3), 0xFF00000000000000(generic(p)+3), + i32 56 }>, align 1 +; CHECK-SAME: 56, 0, 0, 0}; + +;; Test a case that an unaligned pointer is in a nested struct. + +%t2i = type <{ void ()* }> +%t2o = type { i8, %t2i, i32 } +@s2 = addrspace(1) global %t2o { +; CHECK32: .global .align 8 .u8 s2[12] = { +; CHECK64: .global .align 8 .u8 s2[16] = { + i8 12, +; CHECK-SAME: 12, + %t2i <{ void()* @func }>, +; CHECK-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), +; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func), + i32 34} +; CHECK-SAME: 0, 0, 0, +; CHECK-SAME: 34, 0, 0, 0}; + +;; Test that a packed struct whose size is not a multiple of the pointer size +;; is printed in bytes and uses the mask() operator for pointers even though +;; the pointers are aligned. + +%t3 = type <{ void ()*, i8 }> +@s3 = addrspace(1) global %t3 <{ +; CHECK32: .global .align 1 .u8 s3[5] = { +; CHECK64: .global .align 1 .u8 s3[9] = { + void ()* @func, +; CHECK-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), +; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func), + i8 56 }>, align 1 +; CHECK-SAME: 56}; + +;; Test that a packed struct with aligned pointers is printed in words. + +%t4 = type <{ void ()*, i64 }> +@s4 = addrspace(1) global %t4 <{ +; CHECK32: .global .align 1 .u32 s4[3] = { +; CHECK64: .global .align 1 .u64 s4[2] = { + void()* @func, +; CHECK-SAME: func, + i64 15}>, align 1 +; CHECK32-SAME: 15, 0}; +; CHECK64-SAME: 15}; + +;; Test that a packed struct with unaligned pointers inside an array is handled. 
+ +%t5 = type <{ void ()*, i16 }> +@a5 = addrspace(1) global [2 x %t5] [%t5 <{ void()* @func, i16 5 }>, %t5 <{ void()* @func, i16 9 }> ] +; CHECK32: .global .align 8 .u8 a5[12] = { +; CHECK32-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), 5, 0, +; CHECK32-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), 9, 0}; +; CHECK64: .global .align 8 .u8 a5[20] = { +; CHECK64-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), +; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func), +; CHECK64-SAME: 5, 0, +; CHECK64-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), +; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func), +; CHECK64-SAME: 9, 0};