diff --git a/mlir/docs/TargetLLVMIR.md b/mlir/docs/TargetLLVMIR.md --- a/mlir/docs/TargetLLVMIR.md +++ b/mlir/docs/TargetLLVMIR.md @@ -420,7 +420,7 @@ #### Default Calling Convention for Unranked MemRef For unranked memrefs, the list of function arguments always contains two -elements, same as the unranked memref descriptor: an integer rank, and a +elements, the same as the unranked memref descriptor: an integer rank, and a type-erased (`!llvm<"i8*">`) pointer to the ranked memref descriptor. Note that while the *calling convention* does not require allocation, *casting* to unranked memref does since one cannot take an address of an SSA value containing @@ -486,6 +486,49 @@ of the ranked memref descriptor pointed to by an unranked memref descriptor being stored on stack is respected. +**Descriptor buffer arguments.** Functions that return unranked memref +descriptors take one additional buffer argument per unranked result. On return, +these are used to hold the results' inner memref descriptors for *small* ranks +(up to 8 by default). This optimization avoids unnecessary calls to `malloc` and +`free`, which are otherwise necessary at each function call site and return. In +case the result is of greater rank (and does not fit into the buffer), the +calling convention falls back to heap allocation. + +```mlir +llvm.func @bar() { + %0 = call @foo() : () -> (memref<*xf32>) + "use"(%0) : (memref<*xf32>) -> () + return +} + +// Gets converted to the following. + +llvm.func @bar() { + %0 = llvm.mlir.constant(152 : index) : i64 + %1 = llvm.alloca %0 x i8 : (i64) -> !llvm.ptr + %2 = llvm.call @foo(%1) : (!llvm.ptr) -> !llvm.struct<(i64, ptr)> + %3 = llvm.mlir.constant(8 : i64) : i64 + %4 = llvm.extractvalue %2[0] : !llvm.struct<(i64, ptr)> + %5 = llvm.icmp "ule" %4, %3 : i64 + llvm.cond_br %5, ^bb1(%2 : !llvm.struct<(i64, ptr)>), ^bb2 +^bb1(%6: !llvm.struct<(i64, ptr)>): + "use"(%6) + llvm.return +^bb2: + %17 = ... // compute the size for the inner descriptor. 
+ %18 = llvm.alloca %17 x i8 : (i64) -> !llvm.ptr + %19 = llvm.extractvalue %2[1] : !llvm.struct<(i64, ptr)> + %20 = llvm.mlir.constant(false) : i1 + "llvm.intr.memcpy"(%18, %19, %17, %20) : (!llvm.ptr, !llvm.ptr, i64, i1) -> () + llvm.call @free(%19) : (!llvm.ptr) -> () + %21 = llvm.mlir.undef : !llvm.struct<(i64, ptr)> + %22 = llvm.extractvalue %2[0] : !llvm.struct<(i64, ptr)> + %23 = llvm.insertvalue %22, %21[0] : !llvm.struct<(i64, ptr)> + %24 = llvm.insertvalue %18, %23[1] : !llvm.struct<(i64, ptr)> + llvm.br ^bb1(%24 : !llvm.struct<(i64, ptr)>) +} +``` + #### Bare Pointer Calling Convention for Ranked MemRef The "bare pointer" calling convention converts `memref`-typed function arguments @@ -590,7 +633,8 @@ 2. Add a body to the original function (making it non-external) that 1. allocates memref descriptors, 2. populates them, - 3. potentially allocates space for the result struct, and + 3. potentially allocates space for the result struct (also holding any + descriptor buffers for unranked memref results if needed), and 4. passes the pointers to these into the newly declared interface function, then 5. collects the result of the call (potentially from the result struct), @@ -604,10 +648,11 @@ as usual. Results are converted to a special argument if they are of struct type. 2. Populate the body of the newly defined function with IR that - 1. loads descriptors from pointers; - 2. unpacks descriptor into individual non-aggregate values; - 3. passes these values into the original function; - 4. collects the results of the call and + 1. loads descriptors from pointers, + 2. unpacks descriptor into individual non-aggregate values (also inner + descriptor buffer if needed), + 3. passes these values into the original function, then + 4. collects the results of the call, and 5. either copies the results into the result struct or returns them to the caller. 
diff --git a/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h b/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h --- a/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h @@ -35,6 +35,10 @@ bool useBarePtrCallConv = false; bool emitCWrappers = false; + // Specifies the maximum rank for which the calling convention will realize + // stack-allocated buffers for unranked memory descriptor results. + int64_t maxUnrankedDescBufferRank = 8; + enum class AllocLowering { /// Use malloc for for heap allocations. Malloc, diff --git a/mlir/include/mlir/Conversion/LLVMCommon/MemRefBuilder.h b/mlir/include/mlir/Conversion/LLVMCommon/MemRefBuilder.h --- a/mlir/include/mlir/Conversion/LLVMCommon/MemRefBuilder.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/MemRefBuilder.h @@ -173,12 +173,12 @@ /// `unpack`. static unsigned getNumUnpackedValues() { return 2; } - /// Builds IR computing the sizes in bytes (suitable for opaque allocation) - /// and appends the corresponding values into `sizes`. - static void computeSizes(OpBuilder &builder, Location loc, - LLVMTypeConverter &typeConverter, - ArrayRef values, - SmallVectorImpl &sizes); + /// Builds IR computing the size in bytes (suitable for opaque allocation). + Value computeSize(OpBuilder &builder, Location loc, + LLVMTypeConverter &typeConverter); + + // Returns the size in bytes (suitable for opaque allocation). + static int64_t getSize(LLVMTypeConverter &typeConverter, int64_t rank); /// TODO: The following accessors don't take alignment rules between elements /// of the descriptor struct into account. 
For some architectures, it might be diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h --- a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h @@ -116,14 +116,30 @@ ArrayRef sizes, ArrayRef strides, ConversionPatternRewriter &rewriter) const; - /// Copies the memory descriptor for any operands that were unranked - /// descriptors originally to heap-allocated memory (if toDynamic is true) or - /// to stack-allocated memory (otherwise). Also frees the previously used - /// memory (that is assumed to be heap-allocated) if toDynamic is false. - LogicalResult copyUnrankedDescriptors(OpBuilder &builder, Location loc, - TypeRange origTypes, - SmallVectorImpl &operands, - bool toDynamic) const; + /// Ensures that all unranked memory descriptors are on the stack. + /// This concerns the dynamically sized inner descriptors. If their rank is + /// sufficiently small, we know that they reside in stack-allocated buffers + /// already. Otherwise, if they are of a rank greater than the maximum rank + /// for stack-allocated descriptor buffers, they reside on the heap. In this + /// case, we have to copy them over to a newly stack-allocated buffer of the + /// right size and free the previously used buffer on the heap. + void copyUnrankedDescriptorsToStack(ConversionPatternRewriter &rewriter, + Location loc, int64_t maxRankOnStack, + TypeRange origTypes, + SmallVectorImpl &operands) const; + + /// Copies all unranked memory descriptors, using the given buffer arguments + /// or newly heap-allocated memory for the inner descriptors. This is to let + /// unranked memory descriptors escape a function. If their rank is + /// sufficiently small, we assume that their inner descriptor fits into the + /// provided buffer. 
Otherwise, if they are of a rank greater than the maximum + /// rank for stack-allocated descriptor buffers, we allocate a new buffer on + /// the heap. In both cases, we copy the inner descriptor and create a copy of + /// the unranked outer descriptor. + void copyUnrankedDescriptorsToBufferOrHeap( + ConversionPatternRewriter &rewriter, Location loc, int64_t maxRankOnStack, + TypeRange origTypes, ArrayRef descBuffers, + SmallVectorImpl &operands) const; }; /// Utility class for operation conversions targeting the LLVM dialect that diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -522,6 +522,11 @@ Option<"emitCWrappers", "emit-c-wrappers", "bool", /*default=*/"false", "Emit wrappers for C-compatible pointer-to-struct memref " "descriptors">, + Option<"maxUnrankedDescBufferRank", "max-unranked-desc-buffer-rank", + "int64_t", /*default=*/"8", + "Specifies the maximum rank for which the calling convention will " + "realize stack-allocated buffers for unranked memory descriptior " + "results.">, Option<"indexBitwidth", "index-bitwidth", "unsigned", /*default=kDeriveIndexBitwidthFromDataLayout*/"0", "Bitwidth of the index type, 0 to use size of machine word">, diff --git a/mlir/lib/Conversion/LLVMCommon/MemRefBuilder.cpp b/mlir/lib/Conversion/LLVMCommon/MemRefBuilder.cpp --- a/mlir/lib/Conversion/LLVMCommon/MemRefBuilder.cpp +++ b/mlir/lib/Conversion/LLVMCommon/MemRefBuilder.cpp @@ -333,16 +333,11 @@ results.push_back(d.memRefDescPtr(builder, loc)); } -void UnrankedMemRefDescriptor::computeSizes( - OpBuilder &builder, Location loc, LLVMTypeConverter &typeConverter, - ArrayRef values, SmallVectorImpl &sizes) { - if (values.empty()) - return; +Value UnrankedMemRefDescriptor::computeSize(OpBuilder &builder, Location loc, + LLVMTypeConverter &typeConverter) { - // Cache the index type. + // Get constants. 
Type indexType = typeConverter.getIndexType(); - - // Initialize shared constants. Value one = createIndexAttrConstant(builder, loc, indexType, 1); Value two = createIndexAttrConstant(builder, loc, indexType, 2); Value pointerSize = createIndexAttrConstant( @@ -351,31 +346,35 @@ createIndexAttrConstant(builder, loc, indexType, ceilDiv(typeConverter.getIndexTypeBitwidth(), 8)); - sizes.reserve(sizes.size() + values.size()); - for (UnrankedMemRefDescriptor desc : values) { - // Emit IR computing the memory necessary to store the descriptor. This - // assumes the descriptor to be - // { type*, type*, index, index[rank], index[rank] } - // and densely packed, so the total size is - // 2 * sizeof(pointer) + (1 + 2 * rank) * sizeof(index). - // TODO: consider including the actual size (including eventual padding due - // to data layout) into the unranked descriptor. - Value doublePointerSize = - builder.create(loc, indexType, two, pointerSize); - - // (1 + 2 * rank) * sizeof(index) - Value rank = desc.rank(builder, loc); - Value doubleRank = builder.create(loc, indexType, two, rank); - Value doubleRankIncremented = - builder.create(loc, indexType, doubleRank, one); - Value rankIndexSize = builder.create( - loc, indexType, doubleRankIncremented, indexSize); - - // Total allocation size. - Value allocationSize = builder.create( - loc, indexType, doublePointerSize, rankIndexSize); - sizes.push_back(allocationSize); - } + // Emit IR computing the memory necessary to store the descriptor. This + // assumes the descriptor to be + // { type*, type*, index, index[rank], index[rank] } + // and densely packed, so the total size is + // 2 * sizeof(pointer) + (1 + 2 * rank) * sizeof(index). + // TODO: consider including the actual size (including eventual padding due + // to data layout) into the unranked descriptor. 
+ + // 2 * sizeof(pointer) + Value doublePointerSize = + builder.create(loc, indexType, two, pointerSize); + + // (1 + 2 * rank) * sizeof(index) + Value rank = this->rank(builder, loc); + Value doubleRank = builder.create(loc, indexType, two, rank); + Value doubleRankIncremented = + builder.create(loc, indexType, doubleRank, one); + Value rankIndexSize = builder.create( + loc, indexType, doubleRankIncremented, indexSize); + + return builder.create(loc, indexType, doublePointerSize, + rankIndexSize); +} + +int64_t UnrankedMemRefDescriptor::getSize(LLVMTypeConverter &typeConverter, + int64_t rank) { + int64_t ptrSize = ceilDiv(typeConverter.getPointerBitwidth(), 8); + int64_t indexSize = ceilDiv(typeConverter.getIndexTypeBitwidth(), 8); + return 2 * ptrSize + (1 + 2 * rank) * indexSize; } Value UnrankedMemRefDescriptor::allocatedPtr(OpBuilder &builder, Location loc, diff --git a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp --- a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp +++ b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp @@ -225,81 +225,176 @@ return memRefDescriptor; } -LogicalResult ConvertToLLVMPattern::copyUnrankedDescriptors( - OpBuilder &builder, Location loc, TypeRange origTypes, - SmallVectorImpl &operands, bool toDynamic) const { - assert(origTypes.size() == operands.size() && - "expected as may original types as operands"); - - // Find operands of unranked memref type and store them. - SmallVector unrankedMemrefs; - for (unsigned i = 0, e = operands.size(); i < e; ++i) - if (origTypes[i].isa()) - unrankedMemrefs.emplace_back(operands[i]); - - if (unrankedMemrefs.empty()) - return success(); - - // Compute allocation sizes. - SmallVector sizes; - UnrankedMemRefDescriptor::computeSizes(builder, loc, *getTypeConverter(), - unrankedMemrefs, sizes); - - // Get frequently used types. 
- MLIRContext *context = builder.getContext(); - Type voidPtrType = LLVM::LLVMPointerType::get(IntegerType::get(context, 8)); - auto i1Type = IntegerType::get(context, 1); - Type indexType = getTypeConverter()->getIndexType(); - - // Find the malloc and free, or declare them if necessary. - auto module = builder.getInsertionPoint()->getParentOfType(); - LLVM::LLVMFuncOp freeFunc, mallocFunc; - if (toDynamic) - mallocFunc = LLVM::lookupOrCreateMallocFn(module, indexType); - if (!toDynamic) - freeFunc = LLVM::lookupOrCreateFreeFn(module); - - // Initialize shared constants. - Value zero = - builder.create(loc, i1Type, builder.getBoolAttr(false)); - - unsigned unrankedMemrefPos = 0; - for (unsigned i = 0, e = operands.size(); i < e; ++i) { - Type type = origTypes[i]; - if (!type.isa()) +void ConvertToLLVMPattern::copyUnrankedDescriptorsToStack( + ConversionPatternRewriter &rewriter, Location loc, int64_t maxRankOnStack, + TypeRange origTypes, SmallVectorImpl &operands) const { + + // Check if there is any unranked operand to avoid shared constants. + if (llvm::none_of(origTypes, + [](Type ty) { return ty.isa(); })) { + return; + } + + OpBuilder::InsertionGuard guard(rewriter); + + // Find the free function. + auto module = rewriter.getInsertionPoint()->getParentOfType(); + LLVM::LLVMFuncOp freeFunc = LLVM::lookupOrCreateFreeFn(module); + + // Get common types and constants. + Type voidPtrTy = this->getVoidPtrType(); + Type i1Ty = rewriter.getI1Type(); + Value maxRankOnStackCst = rewriter.create( + loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(maxRankOnStack)); + + for (unsigned i = 0; i < operands.size(); i++) { + + // Only copy unranked descriptors. + if (!origTypes[i].isa()) continue; - Value allocationSize = sizes[unrankedMemrefPos++]; - UnrankedMemRefDescriptor desc(operands[i]); - // Allocate memory, copy, and free the source if necessary. - Value memory = - toDynamic - ? 
builder.create(loc, mallocFunc, allocationSize) - .getResult(0) - : builder.create(loc, voidPtrType, allocationSize, - /*alignment=*/0); - Value source = desc.memRefDescPtr(builder, loc); - builder.create(loc, memory, source, allocationSize, zero); - if (!toDynamic) - builder.create(loc, freeFunc, source); + // Split the block to insert descriptor copying logic. + Block *origBlock = rewriter.getBlock(); + Block *continuationBlock = + rewriter.splitBlock(origBlock, rewriter.getInsertionPoint()); + Type descTy = getTypeConverter()->convertType(origTypes[i]); + continuationBlock->addArgument(descTy); + + // Generate the block for large ranks. + // This is the case in which we expect the inner descriptor in dynamic + // memory. We copy it to stack-allocated memory and free the original + // inner descriptor before creating the outer descriptor copy. + Block *largeRankBlock = rewriter.createBlock(origBlock->getParent()); + + // Copy inner descriptor to stack. + UnrankedMemRefDescriptor desc(operands[i]); + Value allocationSize = desc.computeSize(rewriter, loc, *getTypeConverter()); + Value innerDescCpy = rewriter.create( + loc, voidPtrTy, allocationSize, /*alignment=*/0); + Value innerDesc = desc.memRefDescPtr(rewriter, loc); + Value zero = rewriter.create(loc, i1Ty, + rewriter.getBoolAttr(false)); + rewriter.create(loc, innerDescCpy, innerDesc, + allocationSize, zero); + rewriter.create(loc, freeFunc, innerDesc); // Create a new descriptor. The same descriptor can be returned multiple // times, attempting to modify its pointer can lead to memory leaks // (allocated twice and overwritten) or double frees (the caller does not // know if the descriptor points to the same memory). 
- Type descriptorType = getTypeConverter()->convertType(type); - if (!descriptorType) - return failure(); - auto updatedDesc = - UnrankedMemRefDescriptor::undef(builder, loc, descriptorType); - Value rank = desc.rank(builder, loc); - updatedDesc.setRank(builder, loc, rank); - updatedDesc.setMemRefDescPtr(builder, loc, memory); + auto descCpy = UnrankedMemRefDescriptor::undef(rewriter, loc, descTy); + descCpy.setRank(rewriter, loc, desc.rank(rewriter, loc)); + descCpy.setMemRefDescPtr(rewriter, loc, innerDescCpy); + + // Propagate the new descriptor. + rewriter.create(loc, Value(descCpy), continuationBlock); + + // Generate the condition to decide if the inner descriptor is already on + // the stack (for small ranks) or if we have to copy it over (for large + // ranks). + rewriter.setInsertionPointToEnd(origBlock); + Value rank = desc.rank(rewriter, loc); + Value pred = rewriter.create(loc, LLVM::ICmpPredicate::ule, + rank, maxRankOnStackCst); + rewriter.create(loc, pred, continuationBlock, operands[i], + largeRankBlock, ValueRange{}); + + // Continue with the original descriptor or its on-stack copy, which are + // passed as a block argument. + rewriter.setInsertionPointToStart(continuationBlock); + operands[i] = continuationBlock->getArgument(0); + } +} + +void ConvertToLLVMPattern::copyUnrankedDescriptorsToBufferOrHeap( + ConversionPatternRewriter &rewriter, Location loc, int64_t maxRankOnStack, + TypeRange origTypes, ArrayRef descBuffers, + SmallVectorImpl &operands) const { - operands[i] = updatedDesc; + // Check if there is any unranked operand to avoid shared constants. + if (llvm::none_of(origTypes, + [](Type ty) { return ty.isa(); })) { + return; } - return success(); + OpBuilder::InsertionGuard guard(rewriter); + + // Get common types and constants. 
+ Type indexTy = getTypeConverter()->getIndexType(); + Type voidPtrTy = LLVM::LLVMPointerType::get(rewriter.getI8Type()); + Type i1Ty = rewriter.getI1Type(); + Value maxRankOnStackCst = rewriter.create( + loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(maxRankOnStack)); + + // Find the malloc function. + auto module = rewriter.getInsertionPoint()->getParentOfType(); + LLVM::LLVMFuncOp mallocFunc = LLVM::lookupOrCreateMallocFn(module, indexTy); + + unsigned nextBuffer = 0; + for (unsigned i = 0; i < operands.size(); i++) { + + // Only copy unranked descriptors. + if (!origTypes[i].isa()) + continue; + + // Compute the size of the inner descriptor for allocation and copying. + UnrankedMemRefDescriptor desc(operands[i]); + Value allocationSize = desc.computeSize(rewriter, loc, *getTypeConverter()); + + // Split the block to insert descriptor copying logic. + Block *origBlock = rewriter.getBlock(); + Block *continuationBlock = + rewriter.splitBlock(origBlock, rewriter.getInsertionPoint()); + continuationBlock->addArgument(voidPtrTy); + + // Generate the block for small ranks. + // This is the case in which we can copy the inner descriptor to the + // available buffer. + Block *smallRankBlock = rewriter.createBlock(origBlock->getParent()); + Value buffer = descBuffers[nextBuffer++]; + rewriter.create(loc, buffer, continuationBlock); + + // Generate the block for large ranks. + // This is the case in which we copy the inner descriptor to heap-allocated + // memory as the available buffer is too small. + Block *largeRankBlock = rewriter.createBlock(origBlock->getParent()); + Value newBuffer = + rewriter.create(loc, mallocFunc, allocationSize) + .getResult(0); + rewriter.create(loc, newBuffer, continuationBlock); + + // Generate the condition to decide if the inner descriptor can be copied to + // the available buffer (for small ranks) or if we need a bigger one (for + // large ranks). 
+ rewriter.setInsertionPointToEnd(origBlock); + Value rank = desc.rank(rewriter, loc); + Value pred = rewriter.create(loc, LLVM::ICmpPredicate::ule, + rank, maxRankOnStackCst); + rewriter.create(loc, pred, smallRankBlock, largeRankBlock); + + // Continue with the selected buffer for the inner descriptor copy, which is + // passed as a block argument. + rewriter.setInsertionPointToStart(continuationBlock); + Value innerDescCpy = continuationBlock->getArgument(0); + + // Copy the inner descriptor to the new buffer. + Value innerDesc = desc.memRefDescPtr(rewriter, loc); + Value zero = rewriter.create(loc, i1Ty, + rewriter.getBoolAttr(false)); + rewriter.create(loc, innerDescCpy, innerDesc, + allocationSize, zero); + + // Create a new descriptor. The same descriptor can be returned multiple + // times, attempting to modify its pointer can lead to memory leaks + // (allocated twice and overwritten) or double frees (the caller does not + // know if the descriptor points to the same memory). + Type descTy = getTypeConverter()->convertType(origTypes[i]); + auto descCpy = UnrankedMemRefDescriptor::undef(rewriter, loc, descTy); + descCpy.setRank(rewriter, loc, rank); + descCpy.setMemRefDescPtr(rewriter, loc, innerDescCpy); + + operands[i] = descCpy; + } } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp --- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp +++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp @@ -128,10 +128,14 @@ return LLVM::LLVMPointerType::get(converted); } -// Function types are converted to LLVM Function types by recursively converting -// argument and result types. If MLIR Function has zero results, the LLVM -// Function has one VoidType result. If MLIR Function has more than one result, -// they are into an LLVM StructType in their order of appearance. 
+// Function types are converted to LLVM function types by elementwise converting +// argument and result types. If the MLIR function has zero results, the LLVM +// function has one VoidType result. If the MLIR function has more than one +// result, they are packed into an LLVM StructType in their order of appearance. +// For every unranked memref result of the MLIR function, the LLVM function +// expects one preceding buffer argument. These are used to avoid dynamic +// memory allocation for the inner descriptors if their rank is sufficiently small +// (see option max-unranked-desc-buffer-rank). Type LLVMTypeConverter::convertFunctionSignature( FunctionType funcTy, bool isVariadic, LLVMTypeConverter::SignatureConversion &result) { @@ -150,6 +154,16 @@ SmallVector argTypes; argTypes.reserve(llvm::size(result.getConvertedTypes())); + + // Add one void ptr per unranked result. These are used to pass buffers for + // the inner descriptors. + auto voidPtrTy = + LLVM::LLVMPointerType::get(IntegerType::get(&getContext(), 8)); + for (Type ty : funcTy.getResults()) { + if (ty.isa()) + argTypes.push_back(voidPtrTy); + } + for (Type type : result.getConvertedTypes()) argTypes.push_back(type); diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp --- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp +++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp @@ -893,11 +893,10 @@ auto targetDesc = UnrankedMemRefDescriptor::undef( rewriter, loc, typeConverter->convertType(targetType)); targetDesc.setRank(rewriter, loc, resultRank); - SmallVector sizes; - UnrankedMemRefDescriptor::computeSizes(rewriter, loc, *getTypeConverter(), - targetDesc, sizes); + Value allocationSize = + targetDesc.computeSize(rewriter, loc, *getTypeConverter()); Value underlyingDescPtr = rewriter.create( - loc, getVoidPtrType(), sizes.front(), llvm::None); + loc, getVoidPtrType(), allocationSize, llvm::None); targetDesc.setMemRefDescPtr(rewriter, 
loc, underlyingDescPtr); // Extract pointers and offset from the source memref. diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp --- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -41,7 +41,6 @@ #include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" -#include using namespace mlir; @@ -90,7 +89,38 @@ OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointToStart(wrapperFuncOp.addEntryBlock()); + // If any of the results is an unranked descriptor, extract the pre-allocated + // buffers from the result ptrs and pass them on as individual preceding + // arguments. SmallVector args; + if (resultIsNowArg) { + Value resultPtr = wrapperFuncOp.getArgument(0); + if (type.getNumResults() == 1 && + type.getResults().front().isa()) { + Value loaded = rewriter.create(loc, resultPtr); + UnrankedMemRefDescriptor unrankedDescr(loaded); + Value innerDescrPtr = unrankedDescr.memRefDescPtr(rewriter, loc); + args.push_back(innerDescrPtr); + } else if (type.getNumResults() > 1 && + llvm::any_of(type.getResults(), [](Type ty) { + return ty.isa(); + })) { + Value loaded = rewriter.create(loc, resultPtr); + for (auto it : llvm::enumerate(type.getResults())) { + if (it.value().isa()) { + Type resultTy = loaded.getType() + .cast() + .getBody()[it.index()]; + Value loadedResult = rewriter.create( + loc, resultTy, loaded, rewriter.getI64ArrayAttr(it.index())); + UnrankedMemRefDescriptor unrankedDescr(loadedResult); + Value innerDescrPtr = unrankedDescr.memRefDescPtr(rewriter, loc); + args.push_back(innerDescrPtr); + } + } + } + } + size_t argOffset = resultIsNowArg ? 
1 : 0; for (auto &en : llvm::enumerate(type.getInputs())) { Value arg = wrapperFuncOp.getArgument(en.index() + argOffset); @@ -119,15 +149,15 @@ } } -/// Creates an auxiliary function with pointer-to-memref-descriptor-struct -/// arguments instead of unpacked arguments. Creates a body for the (external) -/// `newFuncOp` that allocates a memref descriptor on stack, packs the -/// individual arguments into this descriptor and passes a pointer to it into -/// the auxiliary function. If the result of the function cannot be directly -/// returned, we write it to a special first argument that provides a pointer -/// to a corresponding struct. This auxiliary external function is now -/// compatible with functions defined in C using pointers to C structs -/// corresponding to a memref descriptor. +/// Creates an auxiliary function declaration with +/// pointer-to-memref-descriptor-struct arguments instead of unpacked arguments. +/// Creates a body for the (external) `newFuncOp` that allocates a memref +/// descriptor on stack, packs the individual arguments into this descriptor and +/// passes a pointer to it into the auxiliary function. If the result of the +/// function cannot be directly returned, we write it to a special first +/// argument that provides a pointer to a corresponding struct. This auxiliary +/// external function is now compatible with functions defined in C using +/// pointers to C structs corresponding to a memref descriptor. static void wrapExternalFunction(OpBuilder &builder, Location loc, LLVMTypeConverter &typeConverter, FuncOp funcOp, LLVM::LLVMFuncOp newFuncOp) { @@ -157,21 +187,54 @@ FunctionType type = funcOp.getType(); SmallVector args; args.reserve(type.getNumInputs()); - ValueRange wrapperArgsRange(newFuncOp.getArguments()); + + // Count the number of unranked results, which require special treatment. 
+ int numUnrankedResults = llvm::count_if( + type.getResults(), [](Type ty) { return ty.isa(); }); if (resultIsNowArg) { + // Allocate the struct on the stack and pass the pointer. - Type resultType = + auto resultPtrTy = wrapperType.cast().getParamType(0); Value one = builder.create( loc, typeConverter.convertType(builder.getIndexType()), builder.getIntegerAttr(builder.getIndexType(), 1)); - Value result = builder.create(loc, resultType, one); - args.push_back(result); + Value resultPtr = builder.create(loc, resultPtrTy, one); + args.push_back(resultPtr); + + // If any of the results is an unranked descriptor, populate the + // pre-allocated result with the descriptor buffers that were passed as + // function arguments. + if (type.getNumResults() == 1 && + type.getResults().front().isa()) { + auto desc = UnrankedMemRefDescriptor::undef( + builder, loc, newFuncOp.getType().getReturnType()); + Value buffer = newFuncOp.getArgument(0); + desc.setMemRefDescPtr(builder, loc, buffer); + builder.create(loc, desc, resultPtr); + } else if (type.getNumResults() > 1 && numUnrankedResults > 0) { + int bufferIdx = 0; + Type resultTy = newFuncOp.getType().getReturnType(); + Value result = builder.create(loc, resultTy); + for (auto it : llvm::enumerate(type.getResults())) { + if (auto unrankedMemRefTy = it.value().dyn_cast()) { + Type descTy = typeConverter.convertType(unrankedMemRefTy); + auto desc = UnrankedMemRefDescriptor::undef(builder, loc, descTy); + Value buffer = newFuncOp.getArgument(bufferIdx++); + desc.setMemRefDescPtr(builder, loc, buffer); + result = builder.create( + loc, resultTy, result, desc, builder.getI64ArrayAttr(it.index())); + } + } + builder.create(loc, result, resultPtr); + } } // Iterate over the inputs of the original function and pack values into // memref descriptors if the original type is a memref. 
+ ValueRange wrapperArgsRange( + newFuncOp.getArguments().drop_front(numUnrankedResults)); for (auto &en : llvm::enumerate(type.getInputs())) { Value arg; int numToDrop = 1; @@ -281,8 +344,21 @@ rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(), newFuncOp.end()); if (failed(rewriter.convertRegionTypes(&newFuncOp.getBody(), *typeConverter, - &result))) + &result))) { return nullptr; + } + + // For every unranked result, add a preceding void ptr argument to pass the + // descriptor buffer. + if (!newFuncOp.getBody().empty()) { + auto loc = funcOp.getLoc(); + Block &entryBlock = newFuncOp.getBody().front(); + auto voidPtrTy = getVoidPtrType(); + for (Type ty : funcOp.getType().getResults()) { + if (ty.isa()) + entryBlock.insertArgument(static_cast(0), voidPtrTy, loc); + } + } return newFuncOp; } @@ -305,12 +381,13 @@ if (getTypeConverter()->getOptions().emitCWrappers || funcOp->getAttrOfType(kEmitIfaceAttrName)) { - if (newFuncOp.isExternal()) + if (newFuncOp.isExternal()) { wrapExternalFunction(rewriter, funcOp.getLoc(), *getTypeConverter(), funcOp, newFuncOp); - else + } else { wrapForExternalCallers(rewriter, funcOp.getLoc(), *getTypeConverter(), funcOp, newFuncOp); + } } rewriter.eraseOp(funcOp); @@ -526,23 +603,45 @@ LogicalResult matchAndRewrite(CallOpType callOp, typename CallOpType::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { + auto &typeConverter = *this->getTypeConverter(); + int64_t maxUnrankedDescBufferRank = + typeConverter.getOptions().maxUnrankedDescBufferRank; + // Pack the result types into a struct. 
Type packedResult = nullptr; unsigned numResults = callOp.getNumResults(); auto resultTypes = llvm::to_vector<4>(callOp.getResultTypes()); if (numResults != 0) { - if (!(packedResult = - this->getTypeConverter()->packFunctionResults(resultTypes))) + if (!(packedResult = typeConverter.packFunctionResults(resultTypes))) return failure(); } - auto promoted = this->getTypeConverter()->promoteOperands( + SmallVector args; + + // Create and pass a stack-allocated buffer for every unranked result. + int numUnrankedResults = + llvm::count_if(callOp.getResultTypes(), + [](Type ty) { return ty.isa(); }); + if (numUnrankedResults > 0) { + auto loc = callOp.getLoc(); + Value bufferSize = this->createIndexConstant( + rewriter, loc, + UnrankedMemRefDescriptor::getSize(typeConverter, + maxUnrankedDescBufferRank)); + for (int i = 0; i < numUnrankedResults; i++) { + args.push_back(rewriter.create( + callOp.getLoc(), this->getVoidPtrType(), bufferSize)); + } + } + + auto promoted = typeConverter.promoteOperands( callOp.getLoc(), /*opOperands=*/callOp->getOperands(), adaptor.getOperands(), rewriter); + args.append(promoted.begin(), promoted.end()); auto newOp = rewriter.create( callOp.getLoc(), packedResult ? TypeRange(packedResult) : TypeRange(), - promoted, callOp->getAttrs()); + args, callOp->getAttrs()); SmallVector results; if (numResults < 2) { @@ -553,25 +652,24 @@ // Extract individual results from the structure and return them as list. results.reserve(numResults); for (unsigned i = 0; i < numResults; ++i) { - auto type = - this->typeConverter->convertType(callOp.getResult(i).getType()); + auto type = typeConverter.convertType(callOp.getResult(i).getType()); results.push_back(rewriter.create( callOp.getLoc(), type, newOp->getResult(0), rewriter.getI64ArrayAttr(i))); } } - if (this->getTypeConverter()->getOptions().useBarePtrCallConv) { + if (typeConverter.getOptions().useBarePtrCallConv) { // For the bare-ptr calling convention, promote memref results to // descriptors. 
assert(results.size() == resultTypes.size() && "The number of arguments and types doesn't match"); - this->getTypeConverter()->promoteBarePtrsToDescriptors( - rewriter, callOp.getLoc(), resultTypes, results); - } else if (failed(this->copyUnrankedDescriptors(rewriter, callOp.getLoc(), - resultTypes, results, - /*toDynamic=*/false))) { - return failure(); + typeConverter.promoteBarePtrsToDescriptors(rewriter, callOp.getLoc(), + resultTypes, results); + } else { + this->copyUnrankedDescriptorsToStack(rewriter, callOp.getLoc(), + maxUnrankedDescBufferRank, + resultTypes, results); } rewriter.replaceOp(callOp, results); @@ -813,9 +911,14 @@ } } else { updatedOperands = llvm::to_vector<4>(adaptor.getOperands()); - (void)copyUnrankedDescriptors(rewriter, loc, op.getOperands().getTypes(), - updatedOperands, - /*toDynamic=*/true); + + auto funcOp = op->getParentOfType(); + auto descBuffers = llvm::to_vector<8>(llvm::map_range( + funcOp.getArguments(), [](BlockArgument a) { return Value(a); })); + copyUnrankedDescriptorsToBufferOrHeap( + rewriter, loc, + getTypeConverter()->getOptions().maxUnrankedDescBufferRank, + op.getOperands().getTypes(), descBuffers, updatedOperands); } // If ReturnOp has 0 or 1 operand, create it and return immediately. 
@@ -1190,10 +1293,11 @@ struct LLVMLoweringPass : public ConvertStandardToLLVMBase { LLVMLoweringPass() = default; LLVMLoweringPass(bool useBarePtrCallConv, bool emitCWrappers, - unsigned indexBitwidth, bool useAlignedAlloc, - const llvm::DataLayout &dataLayout) { + int64_t maxUnrankedDescBufferRank, unsigned indexBitwidth, + bool useAlignedAlloc, const llvm::DataLayout &dataLayout) { this->useBarePtrCallConv = useBarePtrCallConv; this->emitCWrappers = emitCWrappers; + this->maxUnrankedDescBufferRank = maxUnrankedDescBufferRank; this->indexBitwidth = indexBitwidth; this->dataLayout = dataLayout.getStringRepresentation(); } @@ -1222,6 +1326,7 @@ dataLayoutAnalysis.getAtOrAbove(m)); options.useBarePtrCallConv = useBarePtrCallConv; options.emitCWrappers = emitCWrappers; + options.maxUnrankedDescBufferRank = maxUnrankedDescBufferRank; if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout) options.overrideIndexBitwidth(indexBitwidth); options.dataLayout = llvm::DataLayout(this->dataLayout); @@ -1257,5 +1362,6 @@ (allocLowering == LowerToLLVMOptions::AllocLowering::AlignedAlloc); return std::make_unique( options.useBarePtrCallConv, options.emitCWrappers, - options.getIndexBitwidth(), useAlignedAlloc, options.dataLayout); + options.maxUnrankedDescBufferRank, options.getIndexBitwidth(), + useAlignedAlloc, options.dataLayout); } diff --git a/mlir/test/Conversion/StandardToLLVM/calling-convention-dbg.mlir b/mlir/test/Conversion/StandardToLLVM/calling-convention-dbg.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/StandardToLLVM/calling-convention-dbg.mlir @@ -0,0 +1,9 @@ +// RUN: mlir-opt %s \ +// RUN: --convert-memref-to-llvm \ +// RUN: --convert-std-to-llvm --allow-unregistered-dialect + +func @bar() -> memref<*xf32> attributes { llvm.emit_c_interface } { + %0 = "get"() : () -> (memref<*xf32>) + return %0 : memref<*xf32> +} + diff --git a/mlir/test/Conversion/StandardToLLVM/calling-convention-external-c-function-callee.mlir 
b/mlir/test/Conversion/StandardToLLVM/calling-convention-external-c-function-callee.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/StandardToLLVM/calling-convention-external-c-function-callee.mlir @@ -0,0 +1,291 @@ +// RUN: mlir-opt %s \ +// RUN: --convert-memref-to-llvm \ +// RUN: --convert-std-to-llvm='max-unranked-desc-buffer-rank=5' | FileCheck %s + +func private @external_no_result(%arg0 : memref) + attributes { llvm.emit_c_interface } + +// CHECK-LABEL: llvm.func @external_no_result +// CHECK-SAME: %[[ALLOC:.*]]: !llvm.ptr, %[[ALIGN:.*]]: !llvm.ptr, %[[OFFSET:.*]]: i64, %[[SIZE0:.*]]: i64, %[[SIZE1:.*]]: i64, %[[STRIDE0:.*]]: i64, %[[STRIDE1:.*]]: i64 + +// Populate the descriptor for arg0. +// CHECK: %[[DESC0:.*]] = llvm.mlir.undef : [[DESC_TY:!llvm.struct<\(ptr, ptr, i64, array<2 x i64>, array<2 x i64>\)>]] +// CHECK: %[[DESC1:.*]] = llvm.insertvalue %[[ALLOC]], %[[DESC0]][0] +// CHECK: %[[DESC2:.*]] = llvm.insertvalue %[[ALIGN]], %[[DESC1]][1] +// CHECK: %[[DESC3:.*]] = llvm.insertvalue %[[OFFSET]], %[[DESC2]][2] +// CHECK: %[[DESC4:.*]] = llvm.insertvalue %[[SIZE0]], %[[DESC3]][3, 0] +// CHECK: %[[DESC5:.*]] = llvm.insertvalue %[[STRIDE0]], %[[DESC4]][4, 0] +// CHECK: %[[DESC6:.*]] = llvm.insertvalue %[[SIZE1]], %[[DESC5]][3, 1] +// CHECK: %[[DESC7:.*]] = llvm.insertvalue %[[STRIDE1]], %[[DESC6]][4, 1] + +// Allocate on stack and store to comply with C calling convention. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[ARG_PTR:.*]] = llvm.alloca %[[C1]] x [[DESC_TY]] +// CHECK: llvm.store %[[DESC7]], %[[ARG_PTR]] + +// Call the interface function. +// CHECK: llvm.call @_mlir_ciface_external_no_result(%[[ARG_PTR]]) +// CHECK: llvm.return + +// Verify that an interface function is emitted. 
+// CHECK-LABEL: llvm.func @_mlir_ciface_external_no_result +// CHECK-SAME: (!llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>>) + + +func private @external_single_result(%arg0 : memref) -> memref + attributes { llvm.emit_c_interface } + +// CHECK-LABEL: llvm.func @external_single_result +// CHECK-SAME: %[[ALLOC:.*]]: !llvm.ptr, %[[ALIGN:.*]]: !llvm.ptr, %[[OFFSET:.*]]: i64, %[[SIZE0:.*]]: i64, %[[STRIDE0:.*]]: i64 + +// Allocate result on stack. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[RESUT_PTR:.*]] = llvm.alloca %[[C1]] x [[RESULT_DESC_TY:!llvm.struct<\(ptr, ptr, i64, array<1 x i64>, array<1 x i64>\)>]] + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef : [[ARG_DESC_TY:!llvm.struct<\(ptr, ptr, i64, array<1 x i64>, array<1 x i64>\)>]] +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ALLOC]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ALIGN]], %[[ARG_DESC1]][1] +// CHECK: %[[ARG_DESC3:.*]] = llvm.insertvalue %[[OFFSET]], %[[ARG_DESC2]][2] +// CHECK: %[[ARG_DESC4:.*]] = llvm.insertvalue %[[SIZE0]], %[[ARG_DESC3]][3, 0] +// CHECK: %[[ARG_DESC5:.*]] = llvm.insertvalue %[[STRIDE0]], %[[ARG_DESC4]][4, 0] + +// Allocate on stack and store to comply with C calling convention. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[ARG_PTR:.*]] = llvm.alloca %[[C1]] x [[ARG_DESC_TY]] +// CHECK: llvm.store %[[ARG_DESC5]], %[[ARG_PTR]] + +// Call the interface function. +// CHECK: llvm.call @_mlir_ciface_external_single_result(%[[RESUT_PTR]], %[[ARG_PTR]]) + +// Load and return the result. +// CHECK: %[[RESULT:.*]] = llvm.load %[[RESUT_PTR]] +// CHECK: llvm.return %[[RESULT]] + +// Verify that an interface function is emitted. 
+// CHECK-LABEL: llvm.func @_mlir_ciface_external_single_result +// CHECK: (!llvm.ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>>, !llvm.ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>>) + + +func private @external_multiple_result(%arg0 : memref) + -> (memref, memref, i64, f32) + attributes { llvm.emit_c_interface } + +// CHECK-LABEL: llvm.func @external_multiple_result +// CHECK-SAME: %[[ALLOC:.*]]: !llvm.ptr, %[[ALIGN:.*]]: !llvm.ptr, %[[OFFSET:.*]]: i64, %[[SIZE0:.*]]: i64, %[[SIZE1:.*]]: i64, %[[STRIDE0:.*]]: i64, %[[STRIDE1:.*]]: i64 + +// Allocate result on stack. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[RESULT_PTR:.*]] = llvm.alloca %[[C1]] x [[RESULT_DESC_TY:!llvm.struct<\(struct<\(ptr, ptr, i64, array<2 x i64>, array<2 x i64>\)>, struct<\(ptr, ptr, i64, array<1 x i64>, array<1 x i64>\)>, i64, f32\)>]] + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef : [[ARG_DESC_TY:!llvm.struct<\(ptr, ptr, i64, array<2 x i64>, array<2 x i64>\)>]] +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ALLOC]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ALIGN]], %[[ARG_DESC1]][1] +// CHECK: %[[ARG_DESC3:.*]] = llvm.insertvalue %[[OFFSET]], %[[ARG_DESC2]][2] +// CHECK: %[[ARG_DESC4:.*]] = llvm.insertvalue %[[SIZE0]], %[[ARG_DESC3]][3, 0] +// CHECK: %[[ARG_DESC5:.*]] = llvm.insertvalue %[[STRIDE0]], %[[ARG_DESC4]][4, 0] +// CHECK: %[[ARG_DESC6:.*]] = llvm.insertvalue %[[SIZE1]], %[[ARG_DESC5]][3, 1] +// CHECK: %[[ARG_DESC7:.*]] = llvm.insertvalue %[[STRIDE1]], %[[ARG_DESC6]][4, 1] + +// Allocate on stack and store to comply with C calling convention. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[ARG_PTR:.*]] = llvm.alloca %[[C1]] x [[ARG_DESC_TY]] +// CHECK: llvm.store %[[ARG_DESC7]], %[[ARG_PTR]] + +// Call the interface function. +// CHECK: llvm.call @_mlir_ciface_external_multiple_result(%[[RESULT_PTR]], %[[ARG_PTR]]) + +// Load and return the result. 
+// CHECK: %[[RESULT:.*]] = llvm.load %[[RESULT_PTR]] +// CHECK: llvm.return %[[RESULT]] + +// Verify that an interface function is emitted. +// CHECK-LABEL: llvm.func @_mlir_ciface_external_multiple_result +// CHECK-SAME: (!llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>, struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64, f32)>>, !llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>>) + +func private @external_multiple_args(%arg0 : i64, %arg1 : memref, + %arg2 : memref, %arg3 : f32) attributes { llvm.emit_c_interface } + +// CHECK-LABEL: llvm.func @external_multiple_args +// CHECK-SAME: %[[IARG:arg0]]: i64, +// CHECK-SAME: %[[ALLOC0:arg1]]: !llvm.ptr, %[[ALIGN0:arg2]]: !llvm.ptr, %[[OFFSET0:arg3]]: i64, %[[SIZE00:arg4]]: i64, %[[SIZE01:arg5]]: i64, %[[STRIDE00:arg6]]: i64, %[[STRIDE01:arg7]]: i64, +// CHECK-SAME: %[[ALLOC1:arg8]]: !llvm.ptr, %[[ALIGN1:arg9]]: !llvm.ptr, %[[OFFSET1:arg10]]: i64, %[[SIZE10:arg11]]: i64, %[[STRIDE10:arg12]]: i64, +// CHECK-SAME: %[[FARG:arg13]]: f32 + +// Populate the descriptor for arg1. +// CHECK: %[[ARG0_DESC0:.*]] = llvm.mlir.undef : [[ARG0_DESC_TY:!llvm.struct<\(ptr, ptr, i64, array<2 x i64>, array<2 x i64>\)>]] +// CHECK: %[[ARG0_DESC1:.*]] = llvm.insertvalue %[[ALLOC0]], %[[ARG0_DESC0]][0] +// CHECK: %[[ARG0_DESC2:.*]] = llvm.insertvalue %[[ALIGN0]], %[[ARG0_DESC1]][1] +// CHECK: %[[ARG0_DESC3:.*]] = llvm.insertvalue %[[OFFSET0]], %[[ARG0_DESC2]][2] +// CHECK: %[[ARG0_DESC4:.*]] = llvm.insertvalue %[[SIZE00]], %[[ARG0_DESC3]][3, 0] +// CHECK: %[[ARG0_DESC5:.*]] = llvm.insertvalue %[[STRIDE00]], %[[ARG0_DESC4]][4, 0] +// CHECK: %[[ARG0_DESC6:.*]] = llvm.insertvalue %[[SIZE01]], %[[ARG0_DESC5]][3, 1] +// CHECK: %[[ARG0_DESC7:.*]] = llvm.insertvalue %[[STRIDE01]], %[[ARG0_DESC6]][4, 1] + +// Allocate on stack and store to comply with C calling convention. 
+// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[ARG0_PTR:.*]] = llvm.alloca %[[C1]] x [[ARG0_DESC_TY]] +// CHECK: llvm.store %[[ARG0_DESC7]], %[[ARG0_PTR]] + +// Populate the descriptor for arg2. +// CHECK: %[[ARG1_DESC0:.*]] = llvm.mlir.undef : [[ARG1_DESC_TY:!llvm.struct<\(ptr, ptr, i64, array<1 x i64>, array<1 x i64>\)>]] +// CHECK: %[[ARG1_DESC1:.*]] = llvm.insertvalue %[[ALLOC1]], %[[ARG1_DESC0]][0] +// CHECK: %[[ARG1_DESC2:.*]] = llvm.insertvalue %[[ALIGN1]], %[[ARG1_DESC1]][1] +// CHECK: %[[ARG1_DESC3:.*]] = llvm.insertvalue %[[OFFSET1]], %[[ARG1_DESC2]][2] +// CHECK: %[[ARG1_DESC4:.*]] = llvm.insertvalue %[[SIZE10]], %[[ARG1_DESC3]][3, 0] +// CHECK: %[[ARG1_DESC5:.*]] = llvm.insertvalue %[[STRIDE10]], %[[ARG1_DESC4]][4, 0] + +// Allocate on stack and store to comply with C calling convention. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[ARG1_PTR:.*]] = llvm.alloca %[[C1]] x [[ARG1_DESC_TY]] +// CHECK: llvm.store %[[ARG1_DESC5]], %[[ARG1_PTR]] + +// Call the interface function. +// CHECK: llvm.call @_mlir_ciface_external_multiple_args(%[[IARG]], %[[ARG0_PTR]], %[[ARG1_PTR]], %[[FARG]]) +// CHECK: llvm.return + +// Verify that an interface function is emitted. +// CHECK-LABEL: llvm.func @_mlir_ciface_external_multiple_args +// CHECK-SAME: (i64, !llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>>, !llvm.ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>>, f32) + + +func private @external_no_result_unranked(%arg0 : memref<*xf32>) + attributes { llvm.emit_c_interface } + +// CHECK-LABEL: llvm.func @external_no_result_unranked +// CHECK-SAME: %[[RANK:.*]]: i64, %[[INNER_DESC:.*]]: !llvm.ptr + +// Populate the descriptor for arg0. 
+// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef : [[ARG_DESC_TY:!llvm.struct<\(i64, ptr\)>]] +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[RANK]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[INNER_DESC]], %[[ARG_DESC1]][1] + +// Allocate on stack and store to comply with C calling convention. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[ARG_PTR:.*]] = llvm.alloca %[[C1]] x [[ARG_DESC_TY]] +// CHECK: llvm.store %[[ARG_DESC2]], %[[ARG_PTR]] + +// Call the interface function. +// CHECK: llvm.call @_mlir_ciface_external_no_result_unranked(%[[ARG_PTR]]) +// CHECK: llvm.return + +// Verify that an interface function is emitted. +// CHECK-LABEL: llvm.func @_mlir_ciface_external_no_result_unranked +// CHECK-SAME: (!llvm.ptr)>>) + + +func private @external_single_result_unranked(%arg0 : memref<*xf32>) + -> memref<*xf32> attributes { llvm.emit_c_interface } + +// CHECK-LABEL: llvm.func @external_single_result_unranked +// CHECK-SAME: %[[RESULT_INNER_DESC_BUFFER:.*]]: !llvm.ptr, %[[ARG_RANK:.*]]: i64, %[[ARG_INNER_DESC:.*]]: !llvm.ptr + +// Allocate result on stack and populate buffer for inner descriptor. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[RESULT_PTR:.*]] = llvm.alloca %[[C1]] x [[RESULT_DESC_TY:!llvm.struct<\(i64, ptr\)>]] +// CHECK: %[[RESULT0:.*]] = llvm.mlir.undef : [[RESULT_DESC_TY]] +// CHECK: %[[RESULT1:.*]] = llvm.insertvalue %[[RESULT_INNER_DESC_BUFFER]], %[[RESULT0]][1] +// CHECK: llvm.store %[[RESULT1]], %[[RESULT_PTR]] + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef : [[ARG_DESC_TY:!llvm.struct<\(i64, ptr\)>]] +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ARG_RANK]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ARG_INNER_DESC]], %[[ARG_DESC1]][1] + +// Allocate on stack and store to comply with C calling convention. 
+// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[ARG_PTR:.*]] = llvm.alloca %[[C1]] x [[ARG_DESC_TY]] +// CHECK: llvm.store %[[ARG_DESC2]], %[[ARG_PTR]] + +// Call the interface function. +// CHECK: llvm.call @_mlir_ciface_external_single_result_unranked(%[[RESULT_PTR]], %[[ARG_PTR]]) + +// Load and return the result. +// CHECK: %[[RESULT:.*]] = llvm.load %[[RESULT_PTR]] +// CHECK: llvm.return %[[RESULT]] + +// Verify that an interface function is emitted. +// CHECK-LABEL: llvm.func @_mlir_ciface_external_single_result_unranked +// CHECK-SAME: (!llvm.ptr)>>, !llvm.ptr)>>) + + +func private @external_multiple_result_unranked(%arg0 : memref<*xf32>) + -> (f32, i64, memref<*xf32>, memref<*xf32>) + attributes { llvm.emit_c_interface } + +// CHECK-LABEL: llvm.func @external_multiple_result_unranked +// CHECK-SAME: %[[RESULT_INNER_DESC_BUFFER0:.*]]: !llvm.ptr, %[[RESULT_INNER_DESC_BUFFER1:.*]]: !llvm.ptr, +// CHECK-SAME: %[[ARG_RANK:.*]]: i64, %[[ARG_INNER_DESC:.*]]: !llvm.ptr + +// Allocate result on stack and populate buffers for inner descriptors. 
+// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[RESULT_PTR:.*]] = llvm.alloca %[[C1]] x [[RESULT_TY:!llvm.struct<\(f32, i64, struct<\(i64, ptr\)>, struct<\(i64, ptr\)>\)>]] +// CHECK: %[[RESULT0:.*]] = llvm.mlir.undef : [[RESULT_TY]] +// CHECK: %[[RESULT_DESC00:.*]] = llvm.mlir.undef : [[RESULT_DESC0_TY:!llvm.struct<\(i64, ptr\)>]] +// CHECK: %[[RESULT_DESC01:.*]] = llvm.insertvalue %[[RESULT_INNER_DESC_BUFFER0]], %[[RESULT_DESC00]][1] +// CHECK: %[[RESULT1:.*]] = llvm.insertvalue %[[RESULT_DESC01]], %[[RESULT0]][2] +// CHECK: %[[RESULT_DESC10:.*]] = llvm.mlir.undef : [[RESULT_DESC1_TY:!llvm.struct<\(i64, ptr\)>]] +// CHECK: %[[RESULT_DESC11:.*]] = llvm.insertvalue %[[RESULT_INNER_DESC_BUFFER1]], %[[RESULT_DESC10]][1] +// CHECK: %[[RESULT2:.*]] = llvm.insertvalue %[[RESULT_DESC11]], %[[RESULT1]][3] +// CHECK: llvm.store %[[RESULT2]], %[[RESULT_PTR]] + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef : [[ARG_DESC_TY:!llvm.struct<\(i64, ptr\)>]] +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ARG_RANK]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ARG_INNER_DESC]], %[[ARG_DESC1]][1] + +// Allocate on stack and store to comply with C calling convention. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[ARG_PTR:.*]] = llvm.alloca %[[C1]] x [[ARG_DESC_TY]] +// CHECK: llvm.store %[[ARG_DESC2]], %[[ARG_PTR]] + +// Call the interface function. +// CHECK: llvm.call @_mlir_ciface_external_multiple_result_unranked(%[[RESULT_PTR]], %[[ARG_PTR]]) + +// Load and return the result. +// CHECK: %[[RESULT:.*]] = llvm.load %[[RESULT_PTR]] +// CHECK: llvm.return %[[RESULT]] + +// Verify that an interface function is emitted. 
+// CHECK-LABEL: llvm.func @_mlir_ciface_external_multiple_result_unranked +// CHECK-SAME: (!llvm.ptr)>, struct<(i64, ptr)>)>>, !llvm.ptr)>>) + + +func private @external_multiple_args_unranked(%arg0 : memref<*xf32>, + %arg1 : f32, %arg2 : memref<*xf32>, %arg3 : i64) + attributes { llvm.emit_c_interface } + +// CHECK-LABEL: llvm.func @external_multiple_args_unranked +// CHECK-SAME: %[[ARG0_RANK:.*]]: i64, %[[ARG0_INNER_DESC:arg1]]: !llvm.ptr, +// CHECK-SAME: %[[FARG:arg2]]: f32, +// CHECK-SAME: %[[ARG2_RANK:.*]]: i64, %[[ARG2_INNER_DESC:arg4]]: !llvm.ptr, +// CHECK-SAME: %[[IARG:.*]]: i64 + +// Populate the descriptor for arg0. +// CHECK: %[[ARG0_DESC0:.*]] = llvm.mlir.undef : [[ARG0_DESC_TY:!llvm.struct<\(i64, ptr\)>]] +// CHECK: %[[ARG0_DESC1:.*]] = llvm.insertvalue %[[ARG0_RANK]], %[[ARG0_DESC0]][0] +// CHECK: %[[ARG0_DESC2:.*]] = llvm.insertvalue %[[ARG0_INNER_DESC]], %[[ARG0_DESC1]][1] + +// Allocate on stack and store to comply with C calling convention. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[ARG0_PTR:.*]] = llvm.alloca %[[C1]] x [[ARG0_DESC_TY]] +// CHECK: llvm.store %[[ARG0_DESC2]], %[[ARG0_PTR]] + +// Populate the descriptor for arg2. +// CHECK: %[[ARG2_DESC0:.*]] = llvm.mlir.undef : [[ARG2_DESC_TY:!llvm.struct<\(i64, ptr\)>]] +// CHECK: %[[ARG2_DESC1:.*]] = llvm.insertvalue %[[ARG2_RANK]], %[[ARG2_DESC0]][0] +// CHECK: %[[ARG2_DESC2:.*]] = llvm.insertvalue %[[ARG2_INNER_DESC]], %[[ARG2_DESC1]][1] + +// Allocate on stack and store to comply with C calling convention. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[ARG2_PTR:.*]] = llvm.alloca %[[C1]] x [[ARG2_DESC_TY]] +// CHECK: llvm.store %[[ARG2_DESC2]], %[[ARG2_PTR]] + +// Call the interface function. +// CHECK: llvm.call @_mlir_ciface_external_multiple_args_unranked(%[[ARG0_PTR]], %[[FARG]], %[[ARG2_PTR]], %[[IARG]]) +// CHECK: llvm.return + +// Verify that an interface function is emitted. 
+// CHECK-LABEL: llvm.func @_mlir_ciface_external_multiple_args_unranked +// CHECK-SAME: (!llvm.ptr)>>, f32, !llvm.ptr)>>, i64) diff --git a/mlir/test/Conversion/StandardToLLVM/calling-convention-external-c-function-caller.mlir b/mlir/test/Conversion/StandardToLLVM/calling-convention-external-c-function-caller.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/StandardToLLVM/calling-convention-external-c-function-caller.mlir @@ -0,0 +1,496 @@ +// RUN: mlir-opt %s \ +// RUN: --convert-memref-to-llvm \ +// RUN: --convert-std-to-llvm='max-unranked-desc-buffer-rank=5' | FileCheck %s + +func @callee_no_result(%arg0 : memref) + attributes { llvm.emit_c_interface } { + %c0 = constant 0 : index + %c1 = constant 1 : index + %0 = memref.load %arg0[%c0, %c1] : memref + return +} + +// CHECK-LABEL: llvm.func @callee_no_result +// CHECK-SAME: %[[ALLOC:.*]]: !llvm.ptr, %[[ALIGN:.*]]: !llvm.ptr, %[[OFFSET:.*]]: i64, %[[SIZE0:.*]]: i64, %[[SIZE1:.*]]: i64, %[[STRIDE0:.*]]: i64, %[[STRIDE1:.*]]: i64 + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ALLOC]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ALIGN]], %[[ARG_DESC1]][1] +// CHECK: %[[ARG_DESC3:.*]] = llvm.insertvalue %[[OFFSET]], %[[ARG_DESC2]][2] +// CHECK: %[[ARG_DESC4:.*]] = llvm.insertvalue %[[SIZE0]], %[[ARG_DESC3]][3, 0] +// CHECK: %[[ARG_DESC5:.*]] = llvm.insertvalue %[[STRIDE0]], %[[ARG_DESC4]][4, 0] +// CHECK: %[[ARG_DESC6:.*]] = llvm.insertvalue %[[SIZE1]], %[[ARG_DESC5]][3, 1] +// CHECK: %[[ARG_DESC7:.*]] = llvm.insertvalue %[[STRIDE1]], %[[ARG_DESC6]][4, 1] + +// CHECK: %{{.*}} = llvm.load %{{.*}} +// CHECK: llvm.return + +// CHECK-LABEL: llvm.func @_mlir_ciface_callee_no_result +// CHECK-SAME: %[[ARG_PTR:.*]]: !llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>> + +// Unpack descriptor for arg0. 
+// CHECK: %[[ARG_DESC:.*]] = llvm.load %[[ARG_PTR]] +// CHECK: %[[ALLOC:.*]] = llvm.extractvalue %[[ARG_DESC]][0] +// CHECK: %[[ALIGN:.*]] = llvm.extractvalue %[[ARG_DESC]][1] +// CHECK: %[[OFFSET:.*]] = llvm.extractvalue %[[ARG_DESC]][2] +// CHECK: %[[SIZE0:.*]] = llvm.extractvalue %[[ARG_DESC]][3, 0] +// CHECK: %[[SIZE1:.*]] = llvm.extractvalue %[[ARG_DESC]][3, 1] +// CHECK: %[[STRIDE0:.*]] = llvm.extractvalue %[[ARG_DESC]][4, 0] +// CHECK: %[[STRIDE1:.*]] = llvm.extractvalue %[[ARG_DESC]][4, 1] + +// Call the function. +// CHECK: llvm.call @callee_no_result(%[[ALLOC]], %[[ALIGN]], %[[OFFSET]], %[[SIZE0]], %[[SIZE1]], %[[STRIDE0]], %[[STRIDE1]]) +// CHECK: llvm.return + + +func @callee_single_result(%arg0 : memref) -> memref + attributes { llvm.emit_c_interface } { + return %arg0 : memref +} + +// CHECK-LABEL: llvm.func @callee_single_result +// CHECK-SAME: %[[ALLOC:.*]]: !llvm.ptr, %[[ALIGN:.*]]: !llvm.ptr, %[[OFFSET:.*]]: i64, %[[SIZE0:.*]]: i64, %[[STRIDE0:.*]]: i64 + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ALLOC]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ALIGN]], %[[ARG_DESC1]][1] +// CHECK: %[[ARG_DESC3:.*]] = llvm.insertvalue %[[OFFSET]], %[[ARG_DESC2]][2] +// CHECK: %[[ARG_DESC4:.*]] = llvm.insertvalue %[[SIZE0]], %[[ARG_DESC3]][3, 0] +// CHECK: %[[ARG_DESC5:.*]] = llvm.insertvalue %[[STRIDE0]], %[[ARG_DESC4]][4, 0] + +// CHECK: llvm.return %[[ARG_DESC5]] + +// CHECK-LABEL: llvm.func @_mlir_ciface_callee_single_result +// CHECK-SAME: %[[RESULT_PTR:.*]]: !llvm.ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>>, +// CHECK-SAME: %[[ARG_PTR:.*]]: !llvm.ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>> + +// Unpack descriptor for arg0. 
+// CHECK: %[[ARG_DESC:.*]] = llvm.load %[[ARG_PTR]] +// CHECK: %[[ALLOC:.*]] = llvm.extractvalue %[[ARG_DESC]][0] +// CHECK: %[[ALIGN:.*]] = llvm.extractvalue %[[ARG_DESC]][1] +// CHECK: %[[OFFSET:.*]] = llvm.extractvalue %[[ARG_DESC]][2] +// CHECK: %[[SIZE0:.*]] = llvm.extractvalue %[[ARG_DESC]][3, 0] +// CHECK: %[[STRIDE0:.*]] = llvm.extractvalue %[[ARG_DESC]][4, 0] + +// Call the function. +// CHECK: %[[RESULT:.*]] = llvm.call @callee_single_result(%[[ALLOC]], %[[ALIGN]], %[[OFFSET]], %[[SIZE0]], %[[STRIDE0]]) + +// Store the result and return. +// CHECK: llvm.store %[[RESULT]], %[[RESULT_PTR]] +// CHECK: llvm.return + + +func @callee_multiple_result(%arg0 : memref, + %arg1 : memref) -> (memref, memref, i64, f32) + attributes { llvm.emit_c_interface } { + %c3 = constant 3 : i64 + %pi = constant 3.141 : f32 + return %arg0, %arg1, %c3, %pi : memref, memref, i64, f32 +} + +// CHECK-LABEL: llvm.func @callee_multiple_result +// CHECK-SAME: %[[ALLOC0:.*]]: !llvm.ptr, %[[ALIGN0:.*]]: !llvm.ptr, %[[OFFSET0:.*]]: i64, %[[SIZE00:.*]]: i64, %[[SIZE01:.*]]: i64, %[[STRIDE00:.*]]: i64, %[[STRIDE01:arg6]]: i64, +// CHECK-SAME: %[[ALLOC1:.*]]: !llvm.ptr, %[[ALIGN1:.*]]: !llvm.ptr, %[[OFFSET1:.*]]: i64, %[[SIZE10:.*]]: i64, %[[STRIDE10:arg11]]: i64 + +// Populate the descriptor for arg0. +// CHECK: %[[ARG0_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG0_DESC1:.*]] = llvm.insertvalue %[[ALLOC0]], %[[ARG0_DESC0]][0] +// CHECK: %[[ARG0_DESC2:.*]] = llvm.insertvalue %[[ALIGN0]], %[[ARG0_DESC1]][1] +// CHECK: %[[ARG0_DESC3:.*]] = llvm.insertvalue %[[OFFSET0]], %[[ARG0_DESC2]][2] +// CHECK: %[[ARG0_DESC4:.*]] = llvm.insertvalue %[[SIZE00]], %[[ARG0_DESC3]][3, 0] +// CHECK: %[[ARG0_DESC5:.*]] = llvm.insertvalue %[[STRIDE00]], %[[ARG0_DESC4]][4, 0] +// CHECK: %[[ARG0_DESC6:.*]] = llvm.insertvalue %[[SIZE01]], %[[ARG0_DESC5]][3, 1] +// CHECK: %[[ARG0_DESC7:.*]] = llvm.insertvalue %[[STRIDE01]], %[[ARG0_DESC6]][4, 1] + +// Populate the descriptor for arg1. 
+// CHECK: %[[ARG1_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG1_DESC1:.*]] = llvm.insertvalue %[[ALLOC1]], %[[ARG1_DESC0]][0] +// CHECK: %[[ARG1_DESC2:.*]] = llvm.insertvalue %[[ALIGN1]], %[[ARG1_DESC1]][1] +// CHECK: %[[ARG1_DESC3:.*]] = llvm.insertvalue %[[OFFSET1]], %[[ARG1_DESC2]][2] +// CHECK: %[[ARG1_DESC4:.*]] = llvm.insertvalue %[[SIZE10]], %[[ARG1_DESC3]][3, 0] +// CHECK: %[[ARG1_DESC5:.*]] = llvm.insertvalue %[[STRIDE10]], %[[ARG1_DESC4]][4, 0] + +// Populate and return result. +// CHECK: %[[RESULT0:.*]] = llvm.mlir.undef +// CHECK: %[[RESULT1:.*]] = llvm.insertvalue %[[ARG0_DESC7]], %[[RESULT0]][0] +// CHECK: %[[RESULT2:.*]] = llvm.insertvalue %[[ARG1_DESC5]], %[[RESULT1]][1] +// CHECK: %[[RESULT3:.*]] = llvm.insertvalue %{{.*}}, %[[RESULT2]][2] +// CHECK: %[[RESULT4:.*]] = llvm.insertvalue %{{.*}}, %[[RESULT3]][3] +// CHECK: llvm.return %[[RESULT4]] + +// CHECK-LABEL: llvm.func @_mlir_ciface_callee_multiple_result +// CHECK-SAME: %[[RESULT_PTR:.*]]: !llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>, struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64, f32)>>, +// CHECK-SAME: %[[ARG0_PTR:.*]]: !llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>>, +// CHECK-SAME: %[[ARG1_PTR:.*]]: !llvm.ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>> + +// Unpack descriptor for arg0. +// CHECK: %[[ARG0_DESC:.*]] = llvm.load %[[ARG0_PTR]] +// CHECK: %[[ALLOC0:.*]] = llvm.extractvalue %[[ARG0_DESC]][0] +// CHECK: %[[ALIGN0:.*]] = llvm.extractvalue %[[ARG0_DESC]][1] +// CHECK: %[[OFFSET0:.*]] = llvm.extractvalue %[[ARG0_DESC]][2] +// CHECK: %[[SIZE00:.*]] = llvm.extractvalue %[[ARG0_DESC]][3, 0] +// CHECK: %[[SIZE01:.*]] = llvm.extractvalue %[[ARG0_DESC]][3, 1] +// CHECK: %[[STRIDE00:.*]] = llvm.extractvalue %[[ARG0_DESC]][4, 0] +// CHECK: %[[STRIDE01:.*]] = llvm.extractvalue %[[ARG0_DESC]][4, 1] + +// Unpack descriptor for arg1. 
+// CHECK: %[[ARG1_DESC:.*]] = llvm.load %[[ARG1_PTR]] +// CHECK: %[[ALLOC1:.*]] = llvm.extractvalue %[[ARG1_DESC]][0] +// CHECK: %[[ALIGN1:.*]] = llvm.extractvalue %[[ARG1_DESC]][1] +// CHECK: %[[OFFSET1:.*]] = llvm.extractvalue %[[ARG1_DESC]][2] +// CHECK: %[[SIZE10:.*]] = llvm.extractvalue %[[ARG1_DESC]][3, 0] +// CHECK: %[[STRIDE10:.*]] = llvm.extractvalue %[[ARG1_DESC]][4, 0] + +// Call the function. +// CHECK: %[[RESULT:.*]] = llvm.call @callee_multiple_result(%[[ALLOC0]], %[[ALIGN0]], %[[OFFSET0]], %[[SIZE00]], %[[SIZE01]], %[[STRIDE00]], %[[STRIDE01]], %[[ALLOC1]], %[[ALIGN1]], %[[OFFSET1]], %[[SIZE10]], %[[STRIDE10]]) + +// Store the result and return. +// CHECK: llvm.store %[[RESULT]], %[[RESULT_PTR]] +// CHECK: llvm.return + + +func @callee_multiple_args(%arg0 : index, %arg1 : memref, + %arg2 : memref, %arg3 : f32) attributes { llvm.emit_c_interface } { + %c0 = constant 0 : index + %0 = memref.load %arg1[%c0, %arg0] : memref + %1 = memref.load %arg2[%arg0] : memref + return +} + +// CHECK-LABEL: llvm.func @callee_multiple_args +// CHECK-SAME: %[[IARG:arg0]]: i64, +// CHECK-SAME: %[[ALLOC0:.*]]: !llvm.ptr, %[[ALIGN0:.*]]: !llvm.ptr, %[[OFFSET0:.*]]: i64, %[[SIZE00:.*]]: i64, %[[SIZE01:.*]]: i64, %[[STRIDE00:.*]]: i64, %[[STRIDE01:arg7]]: i64, +// CHECK-SAME: %[[ALLOC1:.*]]: !llvm.ptr, %[[ALIGN1:.*]]: !llvm.ptr, %[[OFFSET1:.*]]: i64, %[[SIZE10:.*]]: i64, %[[STRIDE10:arg12]]: i64, +// CHECK-SAME: %[[FARG:.*]]: f32 + +// Populate the descriptor for arg1. 
+// CHECK: %[[ARG0_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG0_DESC1:.*]] = llvm.insertvalue %[[ALLOC0]], %[[ARG0_DESC0]][0] +// CHECK: %[[ARG0_DESC2:.*]] = llvm.insertvalue %[[ALIGN0]], %[[ARG0_DESC1]][1] +// CHECK: %[[ARG0_DESC3:.*]] = llvm.insertvalue %[[OFFSET0]], %[[ARG0_DESC2]][2] +// CHECK: %[[ARG0_DESC4:.*]] = llvm.insertvalue %[[SIZE00]], %[[ARG0_DESC3]][3, 0] +// CHECK: %[[ARG0_DESC5:.*]] = llvm.insertvalue %[[STRIDE00]], %[[ARG0_DESC4]][4, 0] +// CHECK: %[[ARG0_DESC6:.*]] = llvm.insertvalue %[[SIZE01]], %[[ARG0_DESC5]][3, 1] +// CHECK: %[[ARG0_DESC7:.*]] = llvm.insertvalue %[[STRIDE01]], %[[ARG0_DESC6]][4, 1] + +// Populate the descriptor for arg2. +// CHECK: %[[ARG1_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG1_DESC1:.*]] = llvm.insertvalue %[[ALLOC1]], %[[ARG1_DESC0]][0] +// CHECK: %[[ARG1_DESC2:.*]] = llvm.insertvalue %[[ALIGN1]], %[[ARG1_DESC1]][1] +// CHECK: %[[ARG1_DESC3:.*]] = llvm.insertvalue %[[OFFSET1]], %[[ARG1_DESC2]][2] +// CHECK: %[[ARG1_DESC4:.*]] = llvm.insertvalue %[[SIZE10]], %[[ARG1_DESC3]][3, 0] +// CHECK: %[[ARG1_DESC5:.*]] = llvm.insertvalue %[[STRIDE10]], %[[ARG1_DESC4]][4, 0] + +// CHECK: %{{.*}} = llvm.load %{{.*}} +// CHECK: %{{.*}} = llvm.load %{{.*}} +// CHECK: llvm.return + +// CHECK-LABEL: llvm.func @_mlir_ciface_callee_multiple_args +// CHECK-SAME: %[[IARG:arg0]]: i64, +// CHECK-SAME: %[[ARG1_PTR:.*]]: !llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>>, +// CHECK-SAME: %[[ARG2_PTR:.*]]: !llvm.ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>>, +// CHECK-SAME: %[[FARG:.*]]: f32 + +// Unpack descriptor for arg1. 
+// CHECK: %[[ARG1_DESC:.*]] = llvm.load %[[ARG1_PTR]] +// CHECK: %[[ALLOC0:.*]] = llvm.extractvalue %[[ARG1_DESC]][0] +// CHECK: %[[ALIGN0:.*]] = llvm.extractvalue %[[ARG1_DESC]][1] +// CHECK: %[[OFFSET0:.*]] = llvm.extractvalue %[[ARG1_DESC]][2] +// CHECK: %[[SIZE00:.*]] = llvm.extractvalue %[[ARG1_DESC]][3, 0] +// CHECK: %[[SIZE01:.*]] = llvm.extractvalue %[[ARG1_DESC]][3, 1] +// CHECK: %[[STRIDE00:.*]] = llvm.extractvalue %[[ARG1_DESC]][4, 0] +// CHECK: %[[STRIDE01:.*]] = llvm.extractvalue %[[ARG1_DESC]][4, 1] + +// Unpack descriptor for arg2. +// CHECK: %[[ARG2_DESC:.*]] = llvm.load %[[ARG2_PTR]] +// CHECK: %[[ALLOC1:.*]] = llvm.extractvalue %[[ARG2_DESC]][0] +// CHECK: %[[ALIGN1:.*]] = llvm.extractvalue %[[ARG2_DESC]][1] +// CHECK: %[[OFFSET1:.*]] = llvm.extractvalue %[[ARG2_DESC]][2] +// CHECK: %[[SIZE10:.*]] = llvm.extractvalue %[[ARG2_DESC]][3, 0] +// CHECK: %[[STRIDE10:.*]] = llvm.extractvalue %[[ARG2_DESC]][4, 0] + +// Call the function. +// CHECK: llvm.call @callee_multiple_args(%[[IARG]], %[[ALLOC0]], %[[ALIGN0]], %[[OFFSET0]], %[[SIZE00]], %[[SIZE01]], %[[STRIDE00]], %[[STRIDE01]], %[[ALLOC1]], %[[ALIGN1]], %[[OFFSET1]], %[[SIZE10]], %[[STRIDE10]], %[[FARG]]) +// CHECK: llvm.return + + +func @callee_no_result_unranked(%arg0 : memref<*xf32>) + attributes { llvm.emit_c_interface } { + %c0 = constant 0 : index + %c1 = constant 1 : index + %0 = memref.cast %arg0 : memref<*xf32> to memref + %1 = memref.load %0[%c0, %c1] : memref + return +} + +// CHECK-LABEL: llvm.func @callee_no_result_unranked +// CHECK-SAME: %[[ARG_RANK:.*]]: i64, %[[ARG_INNER_DESC:.*]]: !llvm.ptr + +// Populate the descriptor for arg0. 
+// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ARG_RANK]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ARG_INNER_DESC]], %[[ARG_DESC1]][1] + +// CHECK: %{{.*}} = llvm.load %{{.*}} +// CHECK: llvm.return + +// CHECK-LABEL: llvm.func @_mlir_ciface_callee_no_result_unranked +// CHECK-SAME: %[[ARG_PTR:.*]]: !llvm.ptr)>> + +// Unpack descriptor for arg0. +// CHECK: %[[ARG_DESC:.*]] = llvm.load %[[ARG_PTR]] +// CHECK: %[[ARG_RANK:.*]] = llvm.extractvalue %[[ARG_DESC]][0] +// CHECK: %[[ARG_INNER_DESC:.*]] = llvm.extractvalue %[[ARG_DESC]][1] + +// Call the function. +// CHECK: llvm.call @callee_no_result_unranked(%[[ARG_RANK]], %[[ARG_INNER_DESC]]) +// CHECK: llvm.return + + +func @callee_single_result_unranked(%arg0 : memref<*xf32>) -> memref<*xf32> + attributes { llvm.emit_c_interface } { + return %arg0 : memref<*xf32> +} + +// CHECK-LABEL: llvm.func @callee_single_result_unranked +// CHECK-SAME: %[[RESULT_INNER_DESC_BUFFER:.*]]: !llvm.ptr, +// CHECK-SAME: %[[ARG_RANK:.*]]: i64, %[[ARG_INNER_DESC:.*]]: !llvm.ptr + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ARG_RANK]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ARG_INNER_DESC]], %[[ARG_DESC1]][1] + +// Common constant. +// CHECK: %[[MAX_SUPPORTED_RANK:.*]] = llvm.mlir.constant(5 : i64) + +// Compute the result's inner descriptor size. 
+// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) +// CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[C8_:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[SIZE_PTRS:.*]] = llvm.mul %[[C2]], %[[C8]] +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[ARG_DESC2]][0] +// CHECK: %[[RANK_TWICE:.*]] = llvm.mul %[[C2]], %[[RANK]] +// CHECK: %[[NUM_I64_FIELDS:.*]] = llvm.add %[[RANK_TWICE]], %[[C1]] +// CHECK: %[[SIZE_I64_FIELDS:.*]] = llvm.mul %[[NUM_I64_FIELDS]], %[[C8_]] +// CHECK: %[[SIZE:.*]] = llvm.add %[[SIZE_PTRS]], %[[SIZE_I64_FIELDS]] + +// Check if the inner descriptor fits into the buffer argument. +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[ARG_DESC2]][0] +// CHECK: %[[PRED:.*]] = llvm.icmp "ule" %[[RANK]], %[[MAX_SUPPORTED_RANK]] +// CHECK: llvm.cond_br %[[PRED]], ^bb2, ^bb3 + +// Copy the inner descriptor to the selected buffer and return a copy of the +// unranked outer descriptor. +// CHECK: ^bb1(%[[SELECTED_BUFFER:.*]]: !llvm.ptr): +// CHECK: %[[ARG_INNER_DESC:.*]] = llvm.extractvalue %[[ARG_DESC2]][1] +// CHECK: %[[C0:.*]] = llvm.mlir.constant(false) +// CHECK: "llvm.intr.memcpy"(%[[SELECTED_BUFFER]], %[[ARG_INNER_DESC]], %[[SIZE]], %[[C0]]) +// CHECK: %[[RESULT_DESC0:.*]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> +// CHECK: %[[RESULT_DESC1:.*]] = llvm.insertvalue %[[RANK]], %[[RESULT_DESC0]][0] +// CHECK: %[[RESULT_DESC2:.*]] = llvm.insertvalue %[[SELECTED_BUFFER]], %[[RESULT_DESC1]][1] +// CHECK: llvm.return %[[RESULT_DESC2]] + +// Select the buffer argument to copy the inner descriptor to. +// CHECK: ^bb2: +// CHECK: llvm.br ^bb1(%[[RESULT_INNER_DESC_BUFFER]] : !llvm.ptr) + +// Allocate a new buffer to copy the inner descriptor to. 
+// CHECK: ^bb3: +// CHECK: %[[NEW_BUFFER:.*]] = llvm.call @malloc(%[[SIZE]]) +// CHECK: llvm.br ^bb1(%[[NEW_BUFFER]] : !llvm.ptr) + +// CHECK-LABEL: llvm.func @_mlir_ciface_callee_single_result_unranked +// CHECK-SAME: %[[RESULT_PTR:.*]]: !llvm.ptr)>>, +// CHECK-SAME: %[[ARG_PTR:.*]]: !llvm.ptr)>> + +// Extract inner descriptor buffer from pre-allocated result. +// CHECK: %[[RESULT:.*]] = llvm.load %[[RESULT_PTR]] +// CHECK: %[[RESULT_INNER_DESC_BUFFER:.*]] = llvm.extractvalue %[[RESULT]][1] + +// Unpack descriptor for arg0. +// CHECK: %[[ARG_DESC:.*]] = llvm.load %[[ARG_PTR]] +// CHECK: %[[ARG_RANK:.*]] = llvm.extractvalue %[[ARG_DESC]][0] +// CHECK: %[[ARG_INNER_DESC:.*]] = llvm.extractvalue %[[ARG_DESC]][1] + +// Call the function. +// CHECK: %[[RESULT:.*]] = llvm.call @callee_single_result_unranked(%[[RESULT_INNER_DESC_BUFFER]], %[[ARG_RANK]], %[[ARG_INNER_DESC]]) + +// Store the result and return. +// CHECK: llvm.store %[[RESULT]], %[[RESULT_PTR]] +// CHECK: llvm.return + + +func @callee_multiple_result_unranked(%arg0 : memref<*xf32>) -> (f32, i64, + memref<*xf32>, memref<*xf32>) attributes { llvm.emit_c_interface } { + %pi = constant 3.141 : f32 + %c3 = constant 3 : i64 + return %pi, %c3, %arg0, %arg0 : f32, i64, memref<*xf32>, memref<*xf32> +} + +// CHECK-LABEL: llvm.func @callee_multiple_result_unranked +// CHECK-SAME: %[[RESULT_INNER_DESC_BUFFER0:.*]]: !llvm.ptr, %[[RESULT_INNER_DESC_BUFFER1:.*]]: !llvm.ptr, +// CHECK-SAME: %[[ARG_RANK:.*]]: i64, %[[ARG_INNER_DESC:.*]]: !llvm.ptr + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ARG_RANK]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ARG_INNER_DESC]], %[[ARG_DESC1]][1] + +// Common constant. +// CHECK: %[[MAX_SUPPORTED_RANK:.*]] = llvm.mlir.constant(5 : i64) + +// Compute first result's inner descriptor size. 
+// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) +// CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[C8_:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[SIZE_PTRS:.*]] = llvm.mul %[[C2]], %[[C8]] +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[ARG_DESC2]][0] +// CHECK: %[[RANK_TWICE:.*]] = llvm.mul %[[C2]], %[[RANK]] +// CHECK: %[[NUM_I64_FIELDS:.*]] = llvm.add %[[RANK_TWICE]], %[[C1]] +// CHECK: %[[SIZE_I64_FIELDS:.*]] = llvm.mul %[[NUM_I64_FIELDS]], %[[C8_]] +// CHECK: %[[SIZE0:.*]] = llvm.add %[[SIZE_PTRS]], %[[SIZE_I64_FIELDS]] + +// Check if the inner descriptor fits into the buffer argument. +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[ARG_DESC2]][0] +// CHECK: %[[PRED:.*]] = llvm.icmp "ule" %[[RANK]], %[[MAX_SUPPORTED_RANK]] +// CHECK: llvm.cond_br %[[PRED]], ^bb3, ^bb4 + +// Copy the inner descriptor to the selected buffer and create a copy of the +// unranked outer descriptor. +// CHECK: ^bb1(%[[SELECTED_BUFFER:.*]]: !llvm.ptr): +// CHECK: %[[ARG_INNER_DESC:.*]] = llvm.extractvalue %[[ARG_DESC2]][1] +// CHECK: %[[C0:.*]] = llvm.mlir.constant(false) +// CHECK: "llvm.intr.memcpy"(%[[SELECTED_BUFFER]], %[[ARG_INNER_DESC]], %[[SIZE0]], %[[C0]]) +// CHECK: %[[RESULT0_DESC0:.*]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> +// CHECK: %[[RESULT0_DESC1:.*]] = llvm.insertvalue %[[RANK]], %[[RESULT0_DESC0]][0] +// CHECK: %[[RESULT0_DESC2:.*]] = llvm.insertvalue %[[SELECTED_BUFFER]], %[[RESULT0_DESC1]][1] + +// Compute second result's inner descriptor size. 
+// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) +// CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[C8_:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[SIZE_PTRS:.*]] = llvm.mul %[[C2]], %[[C8]] +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[ARG_DESC2]][0] +// CHECK: %[[RANK_TWICE:.*]] = llvm.mul %[[C2]], %[[RANK]] +// CHECK: %[[NUM_I64_FIELDS:.*]] = llvm.add %[[RANK_TWICE]], %[[C1]] +// CHECK: %[[SIZE_I64_FIELDS:.*]] = llvm.mul %[[NUM_I64_FIELDS]], %[[C8_]] +// CHECK: %[[SIZE1:.*]] = llvm.add %[[SIZE_PTRS]], %[[SIZE_I64_FIELDS]] + +// Check if the inner descriptor fits into the buffer argument. +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[ARG_DESC2]][0] +// CHECK: %[[PRED:.*]] = llvm.icmp "ule" %[[RANK]], %[[MAX_SUPPORTED_RANK]] +// CHECK: llvm.cond_br %[[PRED]], ^bb5, ^bb6 + +// Copy the inner descriptor to the selected buffer and create a copy of the +// unranked outer descriptor. +// CHECK: ^bb2(%[[SELECTED_BUFFER:.*]]: !llvm.ptr): +// CHECK: %[[ARG_INNER_DESC:.*]] = llvm.extractvalue %[[ARG_DESC2]][1] +// CHECK: %[[C0:.*]] = llvm.mlir.constant(false) +// CHECK: "llvm.intr.memcpy"(%[[SELECTED_BUFFER]], %[[ARG_INNER_DESC]], %[[SIZE1]], %[[C0]]) +// CHECK: %[[RESULT1_DESC0:.*]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> +// CHECK: %[[RESULT1_DESC1:.*]] = llvm.insertvalue %[[RANK]], %[[RESULT1_DESC0]][0] +// CHECK: %[[RESULT1_DESC2:.*]] = llvm.insertvalue %[[SELECTED_BUFFER]], %[[RESULT1_DESC1]][1] + +// Populate and return result. 
+// CHECK: %[[RESULT0:.*]] = llvm.mlir.undef +// CHECK: %[[RESULT1:.*]] = llvm.insertvalue %{{.*}}, %[[RESULT0]][0] +// CHECK: %[[RESULT2:.*]] = llvm.insertvalue %{{.*}}, %[[RESULT1]][1] +// CHECK: %[[RESULT3:.*]] = llvm.insertvalue %[[RESULT0_DESC2]], %[[RESULT2]][2] +// CHECK: %[[RESULT4:.*]] = llvm.insertvalue %[[RESULT1_DESC2]], %[[RESULT3]][3] +// CHECK: llvm.return %[[RESULT4]] + +// Select the buffer argument to copy the inner descriptor to (first result). +// CHECK: ^bb3: +// CHECK: llvm.br ^bb1(%[[RESULT_INNER_DESC_BUFFER0]] : !llvm.ptr) + +// Allocate a new buffer to copy the inner descriptor to (first result). +// CHECK: ^bb4: +// CHECK: %[[NEW_BUFFER:.*]] = llvm.call @malloc(%[[SIZE0]]) +// CHECK: llvm.br ^bb1(%[[NEW_BUFFER]] : !llvm.ptr) + +// Select the buffer argument to copy the inner descriptor to (second result). +// CHECK: ^bb5: +// CHECK: llvm.br ^bb2(%[[RESULT_INNER_DESC_BUFFER1]] : !llvm.ptr) + +// Allocate a new buffer to copy the inner descriptor to (second result). +// CHECK: ^bb6: +// CHECK: %[[NEW_BUFFER:.*]] = llvm.call @malloc(%[[SIZE1]]) +// CHECK: llvm.br ^bb2(%[[NEW_BUFFER]] : !llvm.ptr) + +// CHECK-LABEL: llvm.func @_mlir_ciface_callee_multiple_result_unranked +// CHECK-SAME: %[[RESULT_PTR:.*]]: !llvm.ptr)>, struct<(i64, ptr)>)>>, +// CHECK-SAME: %[[ARG_PTR:.*]]: !llvm.ptr)>> + +// Extract inner descriptor buffers from the pre-allocated result. +// CHECK: %[[RESULT:.*]] = llvm.load %[[RESULT_PTR]] +// CHECK: %[[RESULT_DESC0:.*]] = llvm.extractvalue %[[RESULT]][2] +// CHECK: %[[RESULT_INNER_DESC_BUFFER0:.*]] = llvm.extractvalue %[[RESULT_DESC0]][1] +// CHECK: %[[RESULT_DESC1:.*]] = llvm.extractvalue %[[RESULT]][3] +// CHECK: %[[RESULT_INNER_DESC_BUFFER1:.*]] = llvm.extractvalue %[[RESULT_DESC1]][1] + +// Unpack descriptor for arg0. 
+// CHECK: %[[ARG_DESC:.*]] = llvm.load %[[ARG_PTR]] +// CHECK: %[[ARG_RANK:.*]] = llvm.extractvalue %[[ARG_DESC]][0] +// CHECK: %[[ARG_INNER_DESC:.*]] = llvm.extractvalue %[[ARG_DESC]][1] + +// Call the function. +// CHECK: %[[RESULT:.*]] = llvm.call @callee_multiple_result_unranked(%[[RESULT_INNER_DESC_BUFFER0]], %[[RESULT_INNER_DESC_BUFFER1]], %[[ARG_RANK]], %[[ARG_INNER_DESC]]) + +// Store the result and return. +// CHECK: llvm.store %[[RESULT]], %[[RESULT_PTR]] +// CHECK: llvm.return + + +func @callee_multiple_args_unranked(%arg0 : memref<*xf32>, %arg1 : f32, + %arg2 : memref<*xf32>, %arg3 : index) attributes { llvm.emit_c_interface } { + %c0 = constant 0 : index + %0 = memref.cast %arg0 : memref<*xf32> to memref + %1 = memref.load %0[%c0, %arg3] : memref + %2 = memref.cast %arg2 : memref<*xf32> to memref + %3 = memref.load %2[%arg3] : memref + return +} + +// CHECK-LABEL: llvm.func @callee_multiple_args_unranked +// CHECK-SAME: %[[ARG0_RANK:.*]]: i64, %[[ARG0_INNER_DESC:arg1]]: !llvm.ptr, +// CHECK-SAME: %[[FARG:arg2]]: f32, +// CHECK-SAME: %[[ARG1_RANK:.*]]: i64, %[[ARG1_INNER_DESC:arg4]]: !llvm.ptr, +// CHECK-SAME: %[[IARG:.*]]: i64 + +// Populate the descriptor for arg0. +// CHECK: %[[ARG0_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG0_DESC1:.*]] = llvm.insertvalue %[[ARG0_RANK]], %[[ARG0_DESC0]][0] +// CHECK: %[[ARG0_DESC2:.*]] = llvm.insertvalue %[[ARG0_INNER_DESC]], %[[ARG0_DESC1]][1] + +// Populate the descriptor for arg1. +// CHECK: %[[ARG1_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG1_DESC1:.*]] = llvm.insertvalue %[[ARG1_RANK]], %[[ARG1_DESC0]][0] +// CHECK: %[[ARG1_DESC2:.*]] = llvm.insertvalue %[[ARG1_INNER_DESC]], %[[ARG1_DESC1]][1] + +// CHECK: llvm.return + +// CHECK-LABEL: llvm.func @_mlir_ciface_callee_multiple_args_unranked +// CHECK-SAME: %[[ARG0_PTR:arg0]]: !llvm.ptr)>>, +// CHECK-SAME: %[[FARG:arg1]]: f32, +// CHECK-SAME: %[[ARG1_PTR:arg2]]: !llvm.ptr)>>, +// CHECK-SAME: %[[IARG:arg3]]: i64 + +// Unpack descriptor for arg0. 
+// CHECK: %[[ARG0_DESC:.*]] = llvm.load %[[ARG0_PTR]] +// CHECK: %[[ARG0_RANK:.*]] = llvm.extractvalue %[[ARG0_DESC]][0] +// CHECK: %[[ARG0_INNER_DESC:.*]] = llvm.extractvalue %[[ARG0_DESC]][1] + +// Unpack descriptor for arg1. +// CHECK: %[[ARG1_DESC:.*]] = llvm.load %[[ARG1_PTR]] +// CHECK: %[[ARG1_RANK:.*]] = llvm.extractvalue %[[ARG1_DESC]][0] +// CHECK: %[[ARG1_INNER_DESC:.*]] = llvm.extractvalue %[[ARG1_DESC]][1] + +// Call the function. +// CHECK: llvm.call @callee_multiple_args_unranked(%[[ARG0_RANK]], %[[ARG0_INNER_DESC]], %[[FARG]], %[[ARG1_RANK]], %[[ARG1_INNER_DESC]], %[[IARG]]) +// CHECK: llvm.return diff --git a/mlir/test/Conversion/StandardToLLVM/calling-convention.mlir b/mlir/test/Conversion/StandardToLLVM/calling-convention.mlir --- a/mlir/test/Conversion/StandardToLLVM/calling-convention.mlir +++ b/mlir/test/Conversion/StandardToLLVM/calling-convention.mlir @@ -1,251 +1,607 @@ -// RUN: mlir-opt -convert-memref-to-llvm -convert-std-to-llvm='emit-c-wrappers=1' -reconcile-unrealized-casts %s | FileCheck %s -// RUN: mlir-opt -convert-memref-to-llvm -convert-std-to-llvm -reconcile-unrealized-casts %s | FileCheck %s --check-prefix=EMIT_C_ATTRIBUTE - -// This tests the default memref calling convention and the emission of C -// wrappers. We don't need to separate runs because the wrapper-emission -// version subsumes the calling convention and only adds new functions, that we -// can also file-check in the same run. - -// An external function is transformed into the glue around calling an interface function. -// CHECK-LABEL: @external -// CHECK: %[[ALLOC0:.*]]: !llvm.ptr, %[[ALIGN0:.*]]: !llvm.ptr, %[[OFFSET0:.*]]: i64, %[[SIZE00:.*]]: i64, %[[SIZE01:.*]]: i64, %[[STRIDE00:.*]]: i64, %[[STRIDE01:.*]]: i64, -// CHECK: %[[ALLOC1:.*]]: !llvm.ptr, %[[ALIGN1:.*]]: !llvm.ptr, %[[OFFSET1:.*]]: i64) -func private @external(%arg0: memref, %arg1: memref) - // Populate the descriptor for arg0. 
- // CHECK: %[[DESC00:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // CHECK: %[[DESC01:.*]] = llvm.insertvalue %arg0, %[[DESC00]][0] - // CHECK: %[[DESC02:.*]] = llvm.insertvalue %arg1, %[[DESC01]][1] - // CHECK: %[[DESC03:.*]] = llvm.insertvalue %arg2, %[[DESC02]][2] - // CHECK: %[[DESC04:.*]] = llvm.insertvalue %arg3, %[[DESC03]][3, 0] - // CHECK: %[[DESC05:.*]] = llvm.insertvalue %arg5, %[[DESC04]][4, 0] - // CHECK: %[[DESC06:.*]] = llvm.insertvalue %arg4, %[[DESC05]][3, 1] - // CHECK: %[[DESC07:.*]] = llvm.insertvalue %arg6, %[[DESC06]][4, 1] - - // Allocate on stack and store to comply with C calling convention. - // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) - // CHECK: %[[DESC0_ALLOCA:.*]] = llvm.alloca %[[C1]] x !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // CHECK: llvm.store %[[DESC07]], %[[DESC0_ALLOCA]] - - // Populate the descriptor for arg1. - // CHECK: %[[DESC10:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> - // CHECK: %[[DESC11:.*]] = llvm.insertvalue %arg7, %[[DESC10]][0] : !llvm.struct<(ptr, ptr, i64)> - // CHECK: %[[DESC12:.*]] = llvm.insertvalue %arg8, %[[DESC11]][1] : !llvm.struct<(ptr, ptr, i64)> - // CHECK: %[[DESC13:.*]] = llvm.insertvalue %arg9, %[[DESC12]][2] : !llvm.struct<(ptr, ptr, i64)> - - // Allocate on stack and store to comply with C calling convention. - // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) - // CHECK: %[[DESC1_ALLOCA:.*]] = llvm.alloca %[[C1]] x !llvm.struct<(ptr, ptr, i64)> - // CHECK: llvm.store %[[DESC13]], %[[DESC1_ALLOCA]] - - // Call the interface function. - // CHECK: llvm.call @_mlir_ciface_external - -// Verify that an interface function is emitted. -// CHECK-LABEL: llvm.func @_mlir_ciface_external -// CHECK: (!llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>>, !llvm.ptr, ptr, i64)>>) - -// Verify that the return value is not affected. 
-// CHECK-LABEL: @returner -// CHECK: -> !llvm.struct<(struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>, struct<(ptr, ptr, i64)>)> -func private @returner() -> (memref, memref) - -// CHECK-LABEL: @caller -func @caller() { - %0:2 = call @returner() : () -> (memref, memref) - // Extract individual values from the descriptor for the first memref. - // CHECK: %[[ALLOC0:.*]] = llvm.extractvalue %[[DESC0:.*]][0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // CHECK: %[[ALIGN0:.*]] = llvm.extractvalue %[[DESC0]][1] - // CHECK: %[[OFFSET0:.*]] = llvm.extractvalue %[[DESC0]][2] - // CHECK: %[[SIZE00:.*]] = llvm.extractvalue %[[DESC0]][3, 0] - // CHECK: %[[SIZE01:.*]] = llvm.extractvalue %[[DESC0]][3, 1] - // CHECK: %[[STRIDE00:.*]] = llvm.extractvalue %[[DESC0]][4, 0] - // CHECK: %[[STRIDE01:.*]] = llvm.extractvalue %[[DESC0]][4, 1] - - // Extract individual values from the descriptor for the second memref. - // CHECK: %[[ALLOC1:.*]] = llvm.extractvalue %[[DESC1:.*]][0] : !llvm.struct<(ptr, ptr, i64)> - // CHECK: %[[ALIGN1:.*]] = llvm.extractvalue %[[DESC1]][1] - // CHECK: %[[OFFSET1:.*]] = llvm.extractvalue %[[DESC1]][2] - - // Forward the values to the call. 
- // CHECK: llvm.call @external(%[[ALLOC0]], %[[ALIGN0]], %[[OFFSET0]], %[[SIZE00]], %[[SIZE01]], %[[STRIDE00]], %[[STRIDE01]], %[[ALLOC1]], %[[ALIGN1]], %[[OFFSET1]]) : (!llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64, !llvm.ptr, !llvm.ptr, i64) -> () - call @external(%0#0, %0#1) : (memref, memref) -> () +// RUN: mlir-opt %s \ +// RUN: --convert-memref-to-llvm \ +// RUN: --convert-std-to-llvm='max-unranked-desc-buffer-rank=5' | FileCheck %s + +func @callee_no_result(%arg0 : memref) { + %c0 = constant 0 : index + %c1 = constant 1 : index + %0 = memref.load %arg0[%c0, %c1] : memref return } -// CHECK-LABEL: @callee -// EMIT_C_ATTRIBUTE-LABEL: @callee -func @callee(%arg0: memref, %arg1: index) { - %0 = memref.load %arg0[%arg1] : memref +func @caller_no_result(%arg0 : memref) { + call @callee_no_result(%arg0) : (memref) -> () return } -// Verify that an interface function is emitted. -// CHECK-LABEL: @_mlir_ciface_callee -// CHECK: %[[ARG0:.*]]: !llvm.ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>> - // Load the memref descriptor pointer. - // CHECK: %[[DESC:.*]] = llvm.load %[[ARG0]] : !llvm.ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>> +// CHECK-LABEL: llvm.func @caller_no_result +// CHECK-SAME: %[[ALLOC:.*]]: !llvm.ptr, %[[ALIGN:.*]]: !llvm.ptr, %[[OFFSET:.*]]: i64, %[[SIZE0:.*]]: i64, %[[SIZE1:.*]]: i64, %[[STRIDE0:.*]]: i64, %[[STRIDE1:.*]]: i64 + +// Populate the descriptor for arg0. 
+// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef : [[ARG_DESC_TY:!llvm.struct<\(ptr, ptr, i64, array<2 x i64>, array<2 x i64>\)>]] +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ALLOC]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ALIGN]], %[[ARG_DESC1]][1] +// CHECK: %[[ARG_DESC3:.*]] = llvm.insertvalue %[[OFFSET]], %[[ARG_DESC2]][2] +// CHECK: %[[ARG_DESC4:.*]] = llvm.insertvalue %[[SIZE0]], %[[ARG_DESC3]][3, 0] +// CHECK: %[[ARG_DESC5:.*]] = llvm.insertvalue %[[STRIDE0]], %[[ARG_DESC4]][4, 0] +// CHECK: %[[ARG_DESC6:.*]] = llvm.insertvalue %[[SIZE1]], %[[ARG_DESC5]][3, 1] +// CHECK: %[[ARG_DESC7:.*]] = llvm.insertvalue %[[STRIDE1]], %[[ARG_DESC6]][4, 1] + +// Unpack descriptor. +// CHECK: %[[ALLOC_:.*]] = llvm.extractvalue %[[ARG_DESC7]][0] +// CHECK: %[[ALIGN_:.*]] = llvm.extractvalue %[[ARG_DESC7]][1] +// CHECK: %[[OFFSET_:.*]] = llvm.extractvalue %[[ARG_DESC7]][2] +// CHECK: %[[SIZE0_:.*]] = llvm.extractvalue %[[ARG_DESC7]][3, 0] +// CHECK: %[[SIZE1_:.*]] = llvm.extractvalue %[[ARG_DESC7]][3, 1] +// CHECK: %[[STRIDE0_:.*]] = llvm.extractvalue %[[ARG_DESC7]][4, 0] +// CHECK: %[[STRIDE1_:.*]] = llvm.extractvalue %[[ARG_DESC7]][4, 1] + +// Call the function. +// CHECK: llvm.call @callee_no_result(%[[ALLOC_]], %[[ALIGN_]], %[[OFFSET_]], %[[SIZE0_]], %[[SIZE1_]], %[[STRIDE0_]], %[[STRIDE1_]]) +// CHECK: llvm.return + + +func @callee_single_result(%arg0 : memref) -> memref { + return %arg0 : memref +} + +func @caller_single_result(%arg0 : memref) -> memref { + %0 = call @callee_single_result(%arg0) : (memref) -> memref + return %0 : memref +} - // Extract individual components of the descriptor. 
- // CHECK: %[[ALLOC:.*]] = llvm.extractvalue %[[DESC]][0] - // CHECK: %[[ALIGN:.*]] = llvm.extractvalue %[[DESC]][1] - // CHECK: %[[OFFSET:.*]] = llvm.extractvalue %[[DESC]][2] - // CHECK: %[[SIZE:.*]] = llvm.extractvalue %[[DESC]][3, 0] - // CHECK: %[[STRIDE:.*]] = llvm.extractvalue %[[DESC]][4, 0] +// CHECK-LABEL: llvm.func @caller_single_result +// CHECK-SAME: %[[ALLOC:.*]]: !llvm.ptr, %[[ALIGN:.*]]: !llvm.ptr, %[[OFFSET:.*]]: i64, %[[SIZE0:.*]]: i64, %[[STRIDE0:.*]]: i64 + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ALLOC]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ALIGN]], %[[ARG_DESC1]][1] +// CHECK: %[[ARG_DESC3:.*]] = llvm.insertvalue %[[OFFSET]], %[[ARG_DESC2]][2] +// CHECK: %[[ARG_DESC4:.*]] = llvm.insertvalue %[[SIZE0]], %[[ARG_DESC3]][3, 0] +// CHECK: %[[ARG_DESC5:.*]] = llvm.insertvalue %[[STRIDE0]], %[[ARG_DESC4]][4, 0] + +// Unpack descriptor. +// CHECK: %[[ALLOC_:.*]] = llvm.extractvalue %[[ARG_DESC5]][0] +// CHECK: %[[ALIGN_:.*]] = llvm.extractvalue %[[ARG_DESC5]][1] +// CHECK: %[[OFFSET_:.*]] = llvm.extractvalue %[[ARG_DESC5]][2] +// CHECK: %[[SIZE0_:.*]] = llvm.extractvalue %[[ARG_DESC5]][3, 0] +// CHECK: %[[STRIDE0_:.*]] = llvm.extractvalue %[[ARG_DESC5]][4, 0] + +// Call the function. 
+// CHECK: %[[RESULT:.*]] = llvm.call @callee_single_result(%[[ALLOC_]], %[[ALIGN_]], %[[OFFSET_]], %[[SIZE0_]], %[[STRIDE0_]]) +// CHECK: llvm.return %[[RESULT]] + + +func @callee_multiple_result(%arg0 : memref, + %arg1 : memref) -> (memref, memref, i64, f32) { + %c3 = constant 3 : i64 + %pi = constant 3.141 : f32 + return %arg0, %arg1, %c3, %pi : memref, memref, i64, f32 +} + +func @caller_multiple_result(%arg0 : memref, %arg1 : memref) + -> (memref, memref, i64, f32) { + %0:4 = call @callee_multiple_result(%arg0, %arg1) + : (memref, memref) + -> (memref, memref, i64, f32) + return %0#0, %0#1, %0#2, %0#3 : memref, memref, i64, f32 +} - // Forward the descriptor components to the call. - // CHECK: llvm.call @callee(%[[ALLOC]], %[[ALIGN]], %[[OFFSET]], %[[SIZE]], %[[STRIDE]], %{{.*}}) : (!llvm.ptr, !llvm.ptr, i64, i64, i64, i64) -> () +// CHECK-LABEL: llvm.func @caller_multiple_result +// CHECK-SAME: %[[ALLOC0:.*]]: !llvm.ptr, %[[ALIGN0:.*]]: !llvm.ptr, %[[OFFSET0:.*]]: i64, %[[SIZE00:.*]]: i64, %[[SIZE01:.*]]: i64, %[[STRIDE00:.*]]: i64, %[[STRIDE01:arg6]]: i64, +// CHECK-SAME: %[[ALLOC1:.*]]: !llvm.ptr, %[[ALIGN1:.*]]: !llvm.ptr, %[[OFFSET1:.*]]: i64, %[[SIZE10:.*]]: i64, %[[STRIDE10:arg11]]: i64 + +// Populate the descriptor for arg0. +// CHECK: %[[ARG0_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG0_DESC1:.*]] = llvm.insertvalue %[[ALLOC0]], %[[ARG0_DESC0]][0] +// CHECK: %[[ARG0_DESC2:.*]] = llvm.insertvalue %[[ALIGN0]], %[[ARG0_DESC1]][1] +// CHECK: %[[ARG0_DESC3:.*]] = llvm.insertvalue %[[OFFSET0]], %[[ARG0_DESC2]][2] +// CHECK: %[[ARG0_DESC4:.*]] = llvm.insertvalue %[[SIZE00]], %[[ARG0_DESC3]][3, 0] +// CHECK: %[[ARG0_DESC5:.*]] = llvm.insertvalue %[[STRIDE00]], %[[ARG0_DESC4]][4, 0] +// CHECK: %[[ARG0_DESC6:.*]] = llvm.insertvalue %[[SIZE01]], %[[ARG0_DESC5]][3, 1] +// CHECK: %[[ARG0_DESC7:.*]] = llvm.insertvalue %[[STRIDE01]], %[[ARG0_DESC6]][4, 1] + +// Populate the descriptor for arg1. 
+// CHECK: %[[ARG1_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG1_DESC1:.*]] = llvm.insertvalue %[[ALLOC1]], %[[ARG1_DESC0]][0] +// CHECK: %[[ARG1_DESC2:.*]] = llvm.insertvalue %[[ALIGN1]], %[[ARG1_DESC1]][1] +// CHECK: %[[ARG1_DESC3:.*]] = llvm.insertvalue %[[OFFSET1]], %[[ARG1_DESC2]][2] +// CHECK: %[[ARG1_DESC4:.*]] = llvm.insertvalue %[[SIZE10]], %[[ARG1_DESC3]][3, 0] +// CHECK: %[[ARG1_DESC5:.*]] = llvm.insertvalue %[[STRIDE10]], %[[ARG1_DESC4]][4, 0] + +// Unpack descriptor. +// CHECK: %[[ALLOC0_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][0] +// CHECK: %[[ALIGN0_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][1] +// CHECK: %[[OFFSET0_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][2] +// CHECK: %[[SIZE00_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][3, 0] +// CHECK: %[[SIZE01_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][3, 1] +// CHECK: %[[STRIDE00_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][4, 0] +// CHECK: %[[STRIDE01_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][4, 1] + +// Unpack descriptor. +// CHECK: %[[ALLOC1_:.*]] = llvm.extractvalue %[[ARG1_DESC5]][0] +// CHECK: %[[ALIGN1_:.*]] = llvm.extractvalue %[[ARG1_DESC5]][1] +// CHECK: %[[OFFSET1_:.*]] = llvm.extractvalue %[[ARG1_DESC5]][2] +// CHECK: %[[SIZE10_:.*]] = llvm.extractvalue %[[ARG1_DESC5]][3, 0] +// CHECK: %[[STRIDE10_:.*]] = llvm.extractvalue %[[ARG1_DESC5]][4, 0] + +// Call the function. +// CHECK: %[[RESULT:.*]] = llvm.call @callee_multiple_result(%[[ALLOC0_]], %[[ALIGN0_]], %[[OFFSET0_]], %[[SIZE00_]], %[[SIZE01_]], %[[STRIDE00_]], %[[STRIDE01_]], %[[ALLOC1_]], %[[ALIGN1_]], %[[OFFSET1_]], %[[SIZE10_]], %[[STRIDE10_]]) + +// Unpack results. +// CHECK: %[[RESULT0:.*]] = llvm.extractvalue %[[RESULT]][0] +// CHECK: %[[RESULT1:.*]] = llvm.extractvalue %[[RESULT]][1] +// CHECK: %[[RESULT2:.*]] = llvm.extractvalue %[[RESULT]][2] +// CHECK: %[[RESULT3:.*]] = llvm.extractvalue %[[RESULT]][3] + +// Re-pack results. 
+// CHECK: %[[REPACKED0:.*]] = llvm.mlir.undef +// CHECK: %[[REPACKED1:.*]] = llvm.insertvalue %[[RESULT0]], %[[REPACKED0]][0] +// CHECK: %[[REPACKED2:.*]] = llvm.insertvalue %[[RESULT1]], %[[REPACKED1]][1] +// CHECK: %[[REPACKED3:.*]] = llvm.insertvalue %[[RESULT2]], %[[REPACKED2]][2] +// CHECK: %[[REPACKED4:.*]] = llvm.insertvalue %[[RESULT3]], %[[REPACKED3]][3] + +// CHECK: llvm.return %[[REPACKED4]] + + +func @callee_multiple_args(%arg0 : index, %arg1 : memref, + %arg2 : memref, %arg3 : f32) { + %c0 = constant 0 : index + %0 = memref.load %arg1[%c0, %arg0] : memref + %1 = memref.load %arg2[%arg0] : memref + return +} -// EMIT_C_ATTRIBUTE-NOT: @mlir_ciface_callee +func @caller_multiple_args(%arg0 : index, %arg1 : memref, + %arg2 : memref, %arg3 : f32) { + call @callee_multiple_args(%arg0, %arg1, %arg2, %arg3) + : (index, memref, memref, f32) -> () + return +} -// CHECK-LABEL: @other_callee -// EMIT_C_ATTRIBUTE-LABEL: @other_callee -func @other_callee(%arg0: memref, %arg1: index) attributes { llvm.emit_c_interface } { - %0 = memref.load %arg0[%arg1] : memref +// CHECK-LABEL: llvm.func @caller_multiple_args +// CHECK-SAME: %[[IARG:arg0]]: i64, +// CHECK-SAME: %[[ALLOC0:.*]]: !llvm.ptr, %[[ALIGN0:.*]]: !llvm.ptr, %[[OFFSET0:.*]]: i64, %[[SIZE00:.*]]: i64, %[[SIZE01:.*]]: i64, %[[STRIDE00:.*]]: i64, %[[STRIDE01:arg7]]: i64, +// CHECK-SAME: %[[ALLOC1:.*]]: !llvm.ptr, %[[ALIGN1:.*]]: !llvm.ptr, %[[OFFSET1:.*]]: i64, %[[SIZE10:.*]]: i64, %[[STRIDE10:arg12]]: i64, +// CHECK-SAME: %[[FARG:arg13]]: f32 + +// Populate the descriptor for arg0. 
+// CHECK: %[[ARG0_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG0_DESC1:.*]] = llvm.insertvalue %[[ALLOC0]], %[[ARG0_DESC0]][0] +// CHECK: %[[ARG0_DESC2:.*]] = llvm.insertvalue %[[ALIGN0]], %[[ARG0_DESC1]][1] +// CHECK: %[[ARG0_DESC3:.*]] = llvm.insertvalue %[[OFFSET0]], %[[ARG0_DESC2]][2] +// CHECK: %[[ARG0_DESC4:.*]] = llvm.insertvalue %[[SIZE00]], %[[ARG0_DESC3]][3, 0] +// CHECK: %[[ARG0_DESC5:.*]] = llvm.insertvalue %[[STRIDE00]], %[[ARG0_DESC4]][4, 0] +// CHECK: %[[ARG0_DESC6:.*]] = llvm.insertvalue %[[SIZE01]], %[[ARG0_DESC5]][3, 1] +// CHECK: %[[ARG0_DESC7:.*]] = llvm.insertvalue %[[STRIDE01]], %[[ARG0_DESC6]][4, 1] + +// Populate the descriptor for arg1. +// CHECK: %[[ARG1_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG1_DESC1:.*]] = llvm.insertvalue %[[ALLOC1]], %[[ARG1_DESC0]][0] +// CHECK: %[[ARG1_DESC2:.*]] = llvm.insertvalue %[[ALIGN1]], %[[ARG1_DESC1]][1] +// CHECK: %[[ARG1_DESC3:.*]] = llvm.insertvalue %[[OFFSET1]], %[[ARG1_DESC2]][2] +// CHECK: %[[ARG1_DESC4:.*]] = llvm.insertvalue %[[SIZE10]], %[[ARG1_DESC3]][3, 0] +// CHECK: %[[ARG1_DESC5:.*]] = llvm.insertvalue %[[STRIDE10]], %[[ARG1_DESC4]][4, 0] + +// Unpack descriptor. +// CHECK: %[[ALLOC0_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][0] +// CHECK: %[[ALIGN0_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][1] +// CHECK: %[[OFFSET0_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][2] +// CHECK: %[[SIZE00_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][3, 0] +// CHECK: %[[SIZE01_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][3, 1] +// CHECK: %[[STRIDE00_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][4, 0] +// CHECK: %[[STRIDE01_:.*]] = llvm.extractvalue %[[ARG0_DESC7]][4, 1] + +// Unpack descriptor. 
+// CHECK: %[[ALLOC1_:.*]] = llvm.extractvalue %[[ARG1_DESC5]][0] +// CHECK: %[[ALIGN1_:.*]] = llvm.extractvalue %[[ARG1_DESC5]][1] +// CHECK: %[[OFFSET1_:.*]] = llvm.extractvalue %[[ARG1_DESC5]][2] +// CHECK: %[[SIZE10_:.*]] = llvm.extractvalue %[[ARG1_DESC5]][3, 0] +// CHECK: %[[STRIDE10_:.*]] = llvm.extractvalue %[[ARG1_DESC5]][4, 0] + +// Call the function. +// CHECK: llvm.call @callee_multiple_args(%[[IARG]], %[[ALLOC0_]], %[[ALIGN0_]], %[[OFFSET0_]], %[[SIZE00_]], %[[SIZE01_]], %[[STRIDE00_]], %[[STRIDE01_]], %[[ALLOC1_]], %[[ALIGN1_]], %[[OFFSET1_]], %[[SIZE10_]], %[[STRIDE10_]], %[[FARG]]) +// CHECK: llvm.return + + +func @callee_no_result_unranked(%arg0 : memref<*xf32>) { + %c0 = constant 0 : index + %c1 = constant 1 : index + %0 = memref.cast %arg0 : memref<*xf32> to memref + %1 = memref.load %0[%c0, %c1] : memref return } -// CHECK: @_mlir_ciface_other_callee -// CHECK: llvm.call @other_callee - -// EMIT_C_ATTRIBUTE: @_mlir_ciface_other_callee -// EMIT_C_ATTRIBUTE: llvm.call @other_callee - -//===========================================================================// -// Calling convention on returning unranked memrefs. -//===========================================================================// - -// CHECK-LABEL: llvm.func @return_var_memref_caller -func @return_var_memref_caller(%arg0: memref<4x3xf32>) { - // CHECK: %[[CALL_RES:.*]] = llvm.call @return_var_memref - %0 = call @return_var_memref(%arg0) : (memref<4x3xf32>) -> memref<*xf32> - - // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : index) - // CHECK: %[[TWO:.*]] = llvm.mlir.constant(2 : index) - // These sizes may depend on the data layout, not matching specific values. 
- // CHECK: %[[PTR_SIZE:.*]] = llvm.mlir.constant - // CHECK: %[[IDX_SIZE:.*]] = llvm.mlir.constant - - // CHECK: %[[DOUBLE_PTR_SIZE:.*]] = llvm.mul %[[TWO]], %[[PTR_SIZE]] - // CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RES]][0] : !llvm.struct<(i64, ptr)> - // CHECK: %[[DOUBLE_RANK:.*]] = llvm.mul %[[TWO]], %[[RANK]] - // CHECK: %[[DOUBLE_RANK_INC:.*]] = llvm.add %[[DOUBLE_RANK]], %[[ONE]] - // CHECK: %[[TABLES_SIZE:.*]] = llvm.mul %[[DOUBLE_RANK_INC]], %[[IDX_SIZE]] - // CHECK: %[[ALLOC_SIZE:.*]] = llvm.add %[[DOUBLE_PTR_SIZE]], %[[TABLES_SIZE]] - // CHECK: %[[FALSE:.*]] = llvm.mlir.constant(false) - // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOC_SIZE]] x i8 - // CHECK: %[[SOURCE:.*]] = llvm.extractvalue %[[CALL_RES]][1] - // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[SOURCE]], %[[ALLOC_SIZE]], %[[FALSE]]) - // CHECK: llvm.call @free(%[[SOURCE]]) - // CHECK: %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> - // CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RES]][0] : !llvm.struct<(i64, ptr)> - // CHECK: %[[DESC_1:.*]] = llvm.insertvalue %[[RANK]], %[[DESC]][0] - // CHECK: llvm.insertvalue %[[ALLOCA]], %[[DESC_1]][1] +func @caller_no_result_unranked(%arg0 : memref<*xf32>) { + call @callee_no_result_unranked(%arg0) : (memref<*xf32>) -> () return } -// CHECK-LABEL: llvm.func @return_var_memref -func @return_var_memref(%arg0: memref<4x3xf32>) -> memref<*xf32> attributes { llvm.emit_c_interface } { - // Match the construction of the unranked descriptor. 
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca - // CHECK: %[[MEMORY:.*]] = llvm.bitcast %[[ALLOCA]] - // CHECK: %[[DESC_0:.*]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> - // CHECK: %[[DESC_1:.*]] = llvm.insertvalue %{{.*}}, %[[DESC_0]][0] - // CHECK: %[[DESC_2:.*]] = llvm.insertvalue %[[MEMORY]], %[[DESC_1]][1] - %0 = memref.cast %arg0: memref<4x3xf32> to memref<*xf32> - - // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : index) - // CHECK: %[[TWO:.*]] = llvm.mlir.constant(2 : index) - // These sizes may depend on the data layout, not matching specific values. - // CHECK: %[[PTR_SIZE:.*]] = llvm.mlir.constant - // CHECK: %[[IDX_SIZE:.*]] = llvm.mlir.constant - - // CHECK: %[[DOUBLE_PTR_SIZE:.*]] = llvm.mul %[[TWO]], %[[PTR_SIZE]] - // CHECK: %[[RANK:.*]] = llvm.extractvalue %[[DESC_2]][0] : !llvm.struct<(i64, ptr)> - // CHECK: %[[DOUBLE_RANK:.*]] = llvm.mul %[[TWO]], %[[RANK]] - // CHECK: %[[DOUBLE_RANK_INC:.*]] = llvm.add %[[DOUBLE_RANK]], %[[ONE]] - // CHECK: %[[TABLES_SIZE:.*]] = llvm.mul %[[DOUBLE_RANK_INC]], %[[IDX_SIZE]] - // CHECK: %[[ALLOC_SIZE:.*]] = llvm.add %[[DOUBLE_PTR_SIZE]], %[[TABLES_SIZE]] - // CHECK: %[[FALSE:.*]] = llvm.mlir.constant(false) - // CHECK: %[[ALLOCATED:.*]] = llvm.call @malloc(%[[ALLOC_SIZE]]) - // CHECK: %[[SOURCE:.*]] = llvm.extractvalue %[[DESC_2]][1] - // CHECK: "llvm.intr.memcpy"(%[[ALLOCATED]], %[[SOURCE]], %[[ALLOC_SIZE]], %[[FALSE]]) - // CHECK: %[[NEW_DESC:.*]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> - // CHECK: %[[RANK:.*]] = llvm.extractvalue %[[DESC_2]][0] : !llvm.struct<(i64, ptr)> - // CHECK: %[[NEW_DESC_1:.*]] = llvm.insertvalue %[[RANK]], %[[NEW_DESC]][0] - // CHECK: %[[NEW_DESC_2:.*]] = llvm.insertvalue %[[ALLOCATED]], %[[NEW_DESC_1]][1] - // CHECK: llvm.return %[[NEW_DESC_2]] +// CHECK-LABEL: llvm.func @caller_no_result_unranked +// CHECK-SAME: %[[ARG_RANK:.*]]: i64, %[[ARG_INNER_DESC:.*]]: !llvm.ptr + +// Populate the descriptor for arg0. 
+// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ARG_RANK]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ARG_INNER_DESC]], %[[ARG_DESC1]][1] + +// Unpack descriptor. +// CHECK: %[[ARG_RANK_:.*]] = llvm.extractvalue %[[ARG_DESC2]][0] +// CHECK: %[[ARG_INNER_DESC_:.*]] = llvm.extractvalue %[[ARG_DESC2]][1] + +// Call the function. +// CHECK: llvm.call @callee_no_result_unranked(%[[ARG_RANK_]], %[[ARG_INNER_DESC_]]) +// CHECK: llvm.return + + +func @callee_single_result_unranked(%arg0 : memref<*xf32>) -> memref<*xf32> { + return %arg0 : memref<*xf32> +} + +func @caller_single_result_unranked(%arg0 : memref<*xf32>) -> memref<*xf32> { + %0 = call @callee_single_result_unranked(%arg0) + : (memref<*xf32>) -> memref<*xf32> return %0 : memref<*xf32> } -// Check that the result memref is passed as parameter -// CHECK-LABEL: @_mlir_ciface_return_var_memref -// CHECK-SAME: (%{{.*}}: !llvm.ptr)>>, %{{.*}}: !llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>>) - -// CHECK-LABEL: llvm.func @return_two_var_memref_caller -func @return_two_var_memref_caller(%arg0: memref<4x3xf32>) { - // Only check that we create two different descriptors using different - // memory, and deallocate both sources. The size computation is same as for - // the single result. 
- // CHECK: %[[CALL_RES:.*]] = llvm.call @return_two_var_memref - // CHECK: %[[RES_1:.*]] = llvm.extractvalue %[[CALL_RES]][0] - // CHECK: %[[RES_2:.*]] = llvm.extractvalue %[[CALL_RES]][1] - %0:2 = call @return_two_var_memref(%arg0) : (memref<4x3xf32>) -> (memref<*xf32>, memref<*xf32>) - - // CHECK: %[[ALLOCA_1:.*]] = llvm.alloca %{{.*}} x i8 - // CHECK: %[[SOURCE_1:.*]] = llvm.extractvalue %[[RES_1:.*]][1] : ![[DESC_TYPE:.*]] - // CHECK: "llvm.intr.memcpy"(%[[ALLOCA_1]], %[[SOURCE_1]], %{{.*}}, %[[FALSE:.*]]) - // CHECK: llvm.call @free(%[[SOURCE_1]]) - // CHECK: %[[DESC_1:.*]] = llvm.mlir.undef : ![[DESC_TYPE]] - // CHECK: %[[DESC_11:.*]] = llvm.insertvalue %{{.*}}, %[[DESC_1]][0] - // CHECK: llvm.insertvalue %[[ALLOCA_1]], %[[DESC_11]][1] - - // CHECK: %[[ALLOCA_2:.*]] = llvm.alloca %{{.*}} x i8 - // CHECK: %[[SOURCE_2:.*]] = llvm.extractvalue %[[RES_2:.*]][1] - // CHECK: "llvm.intr.memcpy"(%[[ALLOCA_2]], %[[SOURCE_2]], %{{.*}}, %[[FALSE]]) - // CHECK: llvm.call @free(%[[SOURCE_2]]) - // CHECK: %[[DESC_2:.*]] = llvm.mlir.undef : ![[DESC_TYPE]] - // CHECK: %[[DESC_21:.*]] = llvm.insertvalue %{{.*}}, %[[DESC_2]][0] - // CHECK: llvm.insertvalue %[[ALLOCA_2]], %[[DESC_21]][1] +// CHECK-LABEL: llvm.func @caller_single_result_unranked +// CHECK-SAME: %[[RESULT_INNER_DESC_BUFFER:.*]]: !llvm.ptr, %[[ARG_RANK:.*]]: i64, %[[ARG_INNER_DESC:.*]]: !llvm.ptr + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ARG_RANK]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ARG_INNER_DESC]], %[[ARG_DESC1]][1] + +// Allocate descriptor buffers on the stack. +// CHECK: %[[DEFAULT_DESC_BUFFER_SIZE:.*]] = llvm.mlir.constant(104 : index) +// CHECK: %[[CALL_INNER_DESC_BUFFER:.*]] = llvm.alloca %[[DEFAULT_DESC_BUFFER_SIZE]] x i8 + +// Unpack descriptor. 
+// CHECK: %[[ARG_RANK_:.*]] = llvm.extractvalue %[[ARG_DESC2]][0] +// CHECK: %[[ARG_INNER_DESC_:.*]] = llvm.extractvalue %[[ARG_DESC2]][1] + +// Call the function. +// CHECK: %[[CALL_RESULT_DESC:.*]] = llvm.call @callee_single_result_unranked(%[[CALL_INNER_DESC_BUFFER]], %[[ARG_RANK_]], %[[ARG_INNER_DESC_]]) + +// Common constant. +// CHECK: %[[MAX_SUPPORTED_RANK:.*]] = llvm.mlir.constant(5 : i64) + +// Check if the inner descriptor fits into the buffer argument. +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC]][0] +// CHECK: %[[PRED:.*]] = llvm.icmp "ule" %[[RANK]], %[[MAX_SUPPORTED_RANK]] +// CHECK: llvm.cond_br %[[PRED]], ^bb1(%[[CALL_RESULT_DESC]] : !llvm.struct<(i64, ptr)>), ^bb3 + +// At this point, we have the call result descriptor or its copy. In both cases +// the descriptor, including its inner descriptor, is on the stack. +// To return it, we still have to copy it to the descriptor buffer or to +// dynamically allocated memory. +// CHECK: ^bb1(%[[DESC_OR_CPY:.*]]: !llvm.struct<(i64, ptr)>): + +// Common constant. +// CHECK: %[[MAX_SUPPORTED_RANK_:.*]] = llvm.mlir.constant(5 : i64) + +// Compute the final result's inner descriptor size. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) +// CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[C8_:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[SIZE_PTRS:.*]] = llvm.mul %[[C2]], %[[C8]] +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[DESC_OR_CPY]][0] +// CHECK: %[[RANK_TWICE:.*]] = llvm.mul %[[C2]], %[[RANK]] +// CHECK: %[[NUM_I64_FIELDS:.*]] = llvm.add %[[RANK_TWICE]], %[[C1]] +// CHECK: %[[SIZE_I64_FIELDS:.*]] = llvm.mul %[[NUM_I64_FIELDS]], %[[C8_]] +// CHECK: %[[RESULT_INNER_DESC_SIZE:.*]] = llvm.add %[[SIZE_PTRS]], %[[SIZE_I64_FIELDS]] + +// Check if the inner descriptor fits into the stack-allocated buffer argument. 
+// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[DESC_OR_CPY]][0] +// CHECK: %[[PRED:.*]] = llvm.icmp "ule" %[[RANK]], %[[MAX_SUPPORTED_RANK_]] +// CHECK: llvm.cond_br %[[PRED]], ^bb4, ^bb5 + +// Copy the inner descriptor to the selected buffer and return a copy of the +// unranked outer descriptor. +// CHECK: ^bb2(%[[SELECTED_BUFFER:.*]]: !llvm.ptr): +// CHECK: %[[CALL_RESULT_INNER_DESC:.*]] = llvm.extractvalue %[[DESC_OR_CPY]][1] +// CHECK: %[[C0:.*]] = llvm.mlir.constant(false) +// CHECK: "llvm.intr.memcpy"(%[[SELECTED_BUFFER]], %[[CALL_RESULT_INNER_DESC]], %[[RESULT_INNER_DESC_SIZE]], %[[C0]]) +// CHECK: %[[RESULT_DESC0:.*]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> +// CHECK: %[[RESULT_DESC1:.*]] = llvm.insertvalue %[[RANK]], %[[RESULT_DESC0]][0] +// CHECK: %[[RESULT_DESC2:.*]] = llvm.insertvalue %[[SELECTED_BUFFER]], %[[RESULT_DESC1]][1] +// CHECK: llvm.return %[[RESULT_DESC2]] + +// Copy the call result descriptor to stack-allocated memory. +// This is the case in which it did not fit into the pre-allocated buffer. We +// have to copy the dynamically allocated inner descriptor over to the stack +// and then free it. +// CHECK: ^bb3: + +// Compute the call result's inner descriptor size. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) +// CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[C8_:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[SIZE_PTRS:.*]] = llvm.mul %[[C2]], %[[C8]] +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC]][0] +// CHECK: %[[RANK_TWICE:.*]] = llvm.mul %[[C2]], %[[RANK]] +// CHECK: %[[NUM_I64_FIELDS:.*]] = llvm.add %[[RANK_TWICE]], %[[C1]] +// CHECK: %[[SIZE_I64_FIELDS:.*]] = llvm.mul %[[NUM_I64_FIELDS]], %[[C8_]] +// CHECK: %[[CALL_RESULT_INNER_DESC_SIZE:.*]] = llvm.add %[[SIZE_PTRS]], %[[SIZE_I64_FIELDS]] + +// Stack-allocate a buffer for the call result's inner descriptor and copy it +// over. 
Also, free the previously dynamically allocated inner descriptor. +// CHECK: %[[INNER_DESC:.*]] = llvm.alloca %[[CALL_RESULT_INNER_DESC_SIZE]] x i8 +// CHECK: %[[DYN_INNER_DESC:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC]][1] +// CHECK: %[[C0:.*]] = llvm.mlir.constant(false) +// CHECK: "llvm.intr.memcpy"(%[[INNER_DESC]], %[[DYN_INNER_DESC]], %[[CALL_RESULT_INNER_DESC_SIZE]], %[[C0]]) +// CHECK: llvm.call @free(%[[DYN_INNER_DESC]]) +// CHECK: %[[CALL_RESULT_DESC_CPY0:.*]] = llvm.mlir.undef +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC]][0] +// CHECK: %[[CALL_RESULT_DESC_CPY1:.*]] = llvm.insertvalue %[[RANK]], %[[CALL_RESULT_DESC_CPY0]][0] +// CHECK: %[[CALL_RESULT_DESC_CPY2:.*]] = llvm.insertvalue %[[INNER_DESC]], %[[CALL_RESULT_DESC_CPY1]][1] +// CHECK: llvm.br ^bb1(%[[CALL_RESULT_DESC_CPY2]] : !llvm.struct<(i64, ptr)>) + +// Select the buffer argument to copy the result's inner descriptor to. +// CHECK: ^bb4: +// CHECK: llvm.br ^bb2(%[[RESULT_INNER_DESC_BUFFER]] : !llvm.ptr) + +// Dynamically allocate a new buffer to copy the result's inner descriptor to. 
+// CHECK: ^bb5: +// CHECK: %[[NEW_BUFFER:.*]] = llvm.call @malloc(%[[RESULT_INNER_DESC_SIZE]]) +// CHECK: llvm.br ^bb2(%[[NEW_BUFFER]] : !llvm.ptr) + + +func @callee_multiple_result_unranked(%arg0 : memref<*xf32>) -> (f32, i64, + memref<*xf32>, memref<*xf32>) { + %pi = constant 3.141 : f32 + %c3 = constant 3 : i64 + return %pi, %c3, %arg0, %arg0 : f32, i64, memref<*xf32>, memref<*xf32> +} + +func @caller_multiple_result_unranked(%arg0 : memref<*xf32>) + -> (f32, i64, memref<*xf32>, memref<*xf32>) { + %0:4 = call @callee_multiple_result_unranked(%arg0) : (memref<*xf32>) + -> (f32, i64, memref<*xf32>, memref<*xf32>) + return %0#0, %0#1, %0#2, %0#3 : f32, i64, memref<*xf32>, memref<*xf32> +} + +// CHECK-LABEL: llvm.func @caller_multiple_result_unranked +// CHECK-SAME: %[[RESULT_INNER_DESC_BUFFER0:arg0]]: !llvm.ptr, +// CHECK-SAME: %[[RESULT_INNER_DESC_BUFFER1:arg1]]: !llvm.ptr, +// CHECK-SAME: %[[ARG_RANK:.*]]: i64, %[[ARG_INNER_DESC:.*]]: !llvm.ptr + +// Populate the descriptor for arg0. +// CHECK: %[[ARG_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG_DESC1:.*]] = llvm.insertvalue %[[ARG_RANK]], %[[ARG_DESC0]][0] +// CHECK: %[[ARG_DESC2:.*]] = llvm.insertvalue %[[ARG_INNER_DESC]], %[[ARG_DESC1]][1] + +// Allocate descriptor buffers on the stack. +// CHECK: %[[DEFAULT_DESC_BUFFER_SIZE:.*]] = llvm.mlir.constant(104 : index) +// CHECK: %[[CALL_INNER_DESC_BUFFER0:.*]] = llvm.alloca %[[DEFAULT_DESC_BUFFER_SIZE]] x i8 +// CHECK: %[[CALL_INNER_DESC_BUFFER1:.*]] = llvm.alloca %[[DEFAULT_DESC_BUFFER_SIZE]] x i8 + +// Unpack descriptor. +// CHECK: %[[ARG_RANK_:.*]] = llvm.extractvalue %[[ARG_DESC2]][0] +// CHECK: %[[ARG_INNER_DESC_:.*]] = llvm.extractvalue %[[ARG_DESC2]][1] + +// Call the function. +// CHECK: %[[CALL_RESULT:.*]] = llvm.call @callee_multiple_result_unranked(%[[CALL_INNER_DESC_BUFFER0]], %[[CALL_INNER_DESC_BUFFER1]], %[[ARG_RANK_]], %[[ARG_INNER_DESC_]]) + +// Unpack call result. 
+// CHECK: %[[FRESULT:.*]] = llvm.extractvalue %[[CALL_RESULT]][0] +// CHECK: %[[IRESULT:.*]] = llvm.extractvalue %[[CALL_RESULT]][1] +// CHECK: %[[CALL_RESULT_DESC0:.*]] = llvm.extractvalue %[[CALL_RESULT]][2] +// CHECK: %[[CALL_RESULT_DESC1:.*]] = llvm.extractvalue %[[CALL_RESULT]][3] + +// Common constant. +// CHECK: %[[MAX_SUPPORTED_RANK:.*]] = llvm.mlir.constant(5 : i64) + +// Check if the first call result inner descriptor fits into its buffer argument +// and copy it to a new stack-allocated buffer otherwise. +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC0]][0] +// CHECK: %[[PRED:.*]] = llvm.icmp "ule" %[[RANK]], %[[MAX_SUPPORTED_RANK]] +// CHECK: llvm.cond_br %[[PRED]], ^bb1(%[[CALL_RESULT_DESC0]] : !llvm.struct<(i64, ptr)>), ^bb5 + +// At this point, we have the first call result descriptor or its copy. +// CHECK: ^bb1(%[[DESC_OR_CPY0:.*]]: !llvm.struct<(i64, ptr)>): + +// Check if the second call result inner descriptor fits into its buffer +// argument and copy it to a new stack-allocated buffer otherwise. +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC1]][0] +// CHECK: %[[PRED:.*]] = llvm.icmp "ule" %[[RANK]], %[[MAX_SUPPORTED_RANK]] +// CHECK: llvm.cond_br %[[PRED]], ^bb2(%[[CALL_RESULT_DESC1]] : !llvm.struct<(i64, ptr)>), ^bb6 + +// At this point, we have the call result descriptors or their copies. In both +// cases the descriptors, including their inner descriptors, are on the stack. +// To return them, we still have to copy them to the argument buffer or to +// dynamically allocated memory. +// CHECK: ^bb2(%[[DESC_OR_CPY1:.*]]: !llvm.struct<(i64, ptr)>): + +// Common constant. +// CHECK: %[[MAX_SUPPORTED_RANK_:.*]] = llvm.mlir.constant(5 : i64) + +// Compute the result's first inner descriptor size. 
+// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) +// CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[C8_:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[SIZE_PTRS:.*]] = llvm.mul %[[C2]], %[[C8]] +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[DESC_OR_CPY0]][0] +// CHECK: %[[RANK_TWICE:.*]] = llvm.mul %[[C2]], %[[RANK]] +// CHECK: %[[NUM_I64_FIELDS:.*]] = llvm.add %[[RANK_TWICE]], %[[C1]] +// CHECK: %[[SIZE_I64_FIELDS:.*]] = llvm.mul %[[NUM_I64_FIELDS]], %[[C8_]] +// CHECK: %[[RESULT_INNER_DESC_SIZE0:.*]] = llvm.add %[[SIZE_PTRS]], %[[SIZE_I64_FIELDS]] + +// Check if the inner descriptor fits into the buffer argument. +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[DESC_OR_CPY0]][0] +// CHECK: %[[PRED:.*]] = llvm.icmp "ule" %[[RANK]], %[[MAX_SUPPORTED_RANK_]] +// CHECK: llvm.cond_br %[[PRED]], ^bb7, ^bb8 + +// Copy the call result's first inner descriptor to the selected buffer and +// create a copy of the unranked outer descriptor. +// CHECK: ^bb3(%[[SELECTED_BUFFER0:.*]]: !llvm.ptr): +// CHECK: %[[CALL_RESULT_INNER_DESC0:.*]] = llvm.extractvalue %[[DESC_OR_CPY0]][1] +// CHECK: %[[C0:.*]] = llvm.mlir.constant(false) +// CHECK: "llvm.intr.memcpy"(%[[SELECTED_BUFFER0]], %[[CALL_RESULT_INNER_DESC0]], %[[RESULT_INNER_DESC_SIZE0]], %[[C0]]) +// CHECK: %[[RESULT0_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[RESULT0_DESC1:.*]] = llvm.insertvalue %[[RANK]], %[[RESULT0_DESC0]][0] +// CHECK: %[[RESULT0_DESC2:.*]] = llvm.insertvalue %[[SELECTED_BUFFER0]], %[[RESULT0_DESC1]][1] + +// Compute the result's second inner descriptor size. 
+// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) +// CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[C8_:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[SIZE_PTRS:.*]] = llvm.mul %[[C2]], %[[C8]] +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[DESC_OR_CPY1]][0] +// CHECK: %[[RANK_TWICE:.*]] = llvm.mul %[[C2]], %[[RANK]] +// CHECK: %[[NUM_I64_FIELDS:.*]] = llvm.add %[[RANK_TWICE]], %[[C1]] +// CHECK: %[[SIZE_I64_FIELDS:.*]] = llvm.mul %[[NUM_I64_FIELDS]], %[[C8_]] +// CHECK: %[[RESULT_INNER_DESC_SIZE1:.*]] = llvm.add %[[SIZE_PTRS]], %[[SIZE_I64_FIELDS]] + +// Check if the inner descriptor fits into the buffer argument. +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[DESC_OR_CPY1]][0] +// CHECK: %[[PRED:.*]] = llvm.icmp "ule" %[[RANK]], %[[MAX_SUPPORTED_RANK_]] +// CHECK: llvm.cond_br %[[PRED]], ^bb9, ^bb10 + +// Copy the call result's second inner descriptor to the selected buffer and +// create a copy of the unranked outer descriptor. +// CHECK: ^bb4(%[[SELECTED_BUFFER1:.*]]: !llvm.ptr): +// CHECK: %[[CALL_RESULT_INNER_DESC1:.*]] = llvm.extractvalue %[[DESC_OR_CPY1]][1] +// CHECK: %[[C0:.*]] = llvm.mlir.constant(false) +// CHECK: "llvm.intr.memcpy"(%[[SELECTED_BUFFER1]], %[[CALL_RESULT_INNER_DESC1]], %[[RESULT_INNER_DESC_SIZE1]], %[[C0]]) +// CHECK: %[[RESULT1_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[RESULT1_DESC1:.*]] = llvm.insertvalue %[[RANK]], %[[RESULT1_DESC0]][0] +// CHECK: %[[RESULT1_DESC2:.*]] = llvm.insertvalue %[[SELECTED_BUFFER1]], %[[RESULT1_DESC1]][1] + +// Pack the final result and return it. 
+// CHECK: %[[RESULT0:.*]] = llvm.mlir.undef +// CHECK: %[[RESULT1:.*]] = llvm.insertvalue %[[FRESULT]], %[[RESULT0]][0] +// CHECK: %[[RESULT2:.*]] = llvm.insertvalue %[[IRESULT]], %[[RESULT1]][1] +// CHECK: %[[RESULT3:.*]] = llvm.insertvalue %[[RESULT0_DESC2]], %[[RESULT2]][2] +// CHECK: %[[RESULT4:.*]] = llvm.insertvalue %[[RESULT1_DESC2]], %[[RESULT3]][3] +// CHECK: llvm.return %[[RESULT4]] + +// Copy the call result's first descriptor to stack-allocated memory. +// This is the case in which it did not fit into the pre-allocated buffer. +// CHECK: ^bb5: + +// Compute the descriptor size. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) +// CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[C8_:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[SIZE_PTRS:.*]] = llvm.mul %[[C2]], %[[C8]] +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC0]][0] +// CHECK: %[[RANK_TWICE:.*]] = llvm.mul %[[C2]], %[[RANK]] +// CHECK: %[[NUM_I64_FIELDS:.*]] = llvm.add %[[RANK_TWICE]], %[[C1]] +// CHECK: %[[SIZE_I64_FIELDS:.*]] = llvm.mul %[[NUM_I64_FIELDS]], %[[C8_]] +// CHECK: %[[CALL_RESULT_INNER_DESC_SIZE0:.*]] = llvm.add %[[SIZE_PTRS]], %[[SIZE_I64_FIELDS]] + +// Stack-allocate a buffer for the call result's first inner descriptor and copy +// it over. Also, free the previously dynamically allocated inner descriptor. 
+// CHECK: %[[INNER_DESC:.*]] = llvm.alloca %[[CALL_RESULT_INNER_DESC_SIZE0]] x i8 +// CHECK: %[[DYN_INNER_DESC:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC0]][1] +// CHECK: %[[C0:.*]] = llvm.mlir.constant(false) +// CHECK: "llvm.intr.memcpy"(%[[INNER_DESC]], %[[DYN_INNER_DESC]], %[[CALL_RESULT_INNER_DESC_SIZE0]], %[[C0]]) +// CHECK: llvm.call @free(%[[DYN_INNER_DESC]]) +// CHECK: %[[CALL_RESULT_DESC0_CPY0:.*]] = llvm.mlir.undef +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC0]][0] +// CHECK: %[[CALL_RESULT_DESC0_CPY1:.*]] = llvm.insertvalue %[[RANK]], %[[CALL_RESULT_DESC0_CPY0]][0] +// CHECK: %[[CALL_RESULT_DESC0_CPY2:.*]] = llvm.insertvalue %[[INNER_DESC]], %[[CALL_RESULT_DESC0_CPY1]][1] +// CHECK: llvm.br ^bb1(%[[CALL_RESULT_DESC0_CPY2]] : !llvm.struct<(i64, ptr)>) + +// Copy the call result's second descriptor to stack-allocated memory. +// This is the case in which it did not fit into the pre-allocated buffer. +// CHECK: ^bb6: + +// Compute the descriptor size. +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) +// CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[C8_:.*]] = llvm.mlir.constant(8 : index) +// CHECK: %[[SIZE_PTRS:.*]] = llvm.mul %[[C2]], %[[C8]] +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC1]][0] +// CHECK: %[[RANK_TWICE:.*]] = llvm.mul %[[C2]], %[[RANK]] +// CHECK: %[[NUM_I64_FIELDS:.*]] = llvm.add %[[RANK_TWICE]], %[[C1]] +// CHECK: %[[SIZE_I64_FIELDS:.*]] = llvm.mul %[[NUM_I64_FIELDS]], %[[C8_]] +// CHECK: %[[CALL_RESULT_INNER_DESC_SIZE1:.*]] = llvm.add %[[SIZE_PTRS]], %[[SIZE_I64_FIELDS]] + +// Stack-allocate a buffer for the call result's second inner descriptor and +// copy it over. Also, free the previously dynamically allocated inner +// descriptor. 
+// CHECK: %[[INNER_DESC:.*]] = llvm.alloca %[[CALL_RESULT_INNER_DESC_SIZE1]] x i8 +// CHECK: %[[DYN_INNER_DESC:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC1]][1] +// CHECK: %[[C0:.*]] = llvm.mlir.constant(false) +// CHECK: "llvm.intr.memcpy"(%[[INNER_DESC]], %[[DYN_INNER_DESC]], %[[CALL_RESULT_INNER_DESC_SIZE1]], %[[C0]]) +// CHECK: llvm.call @free(%[[DYN_INNER_DESC]]) +// CHECK: %[[CALL_RESULT_DESC1_CPY0:.*]] = llvm.mlir.undef +// CHECK: %[[RANK:.*]] = llvm.extractvalue %[[CALL_RESULT_DESC1]][0] +// CHECK: %[[CALL_RESULT_DESC1_CPY1:.*]] = llvm.insertvalue %[[RANK]], %[[CALL_RESULT_DESC1_CPY0]][0] +// CHECK: %[[CALL_RESULT_DESC1_CPY2:.*]] = llvm.insertvalue %[[INNER_DESC]], %[[CALL_RESULT_DESC1_CPY1]][1] +// CHECK: llvm.br ^bb2(%[[CALL_RESULT_DESC1_CPY2]] : !llvm.struct<(i64, ptr)>) + +// Select the buffer argument to copy the result's first inner descriptor to. +// CHECK: ^bb7: +// CHECK: llvm.br ^bb3(%[[RESULT_INNER_DESC_BUFFER0]] : !llvm.ptr) + +// Dynamically allocate a new buffer to copy the result's first inner descriptor +// to. +// CHECK: ^bb8: +// CHECK: %[[NEW_BUFFER:.*]] = llvm.call @malloc(%[[RESULT_INNER_DESC_SIZE0]]) +// CHECK: llvm.br ^bb3(%[[NEW_BUFFER]] : !llvm.ptr) + +// Select the buffer argument to copy the result's second inner descriptor to. +// CHECK: ^bb9: +// CHECK: llvm.br ^bb4(%[[RESULT_INNER_DESC_BUFFER1]] : !llvm.ptr) + +// Dynamically allocate a new buffer to copy the result's second inner descriptor +// to. 
+// CHECK: ^bb10: +// CHECK: %[[NEW_BUFFER:.*]] = llvm.call @malloc(%[[RESULT_INNER_DESC_SIZE1]]) +// CHECK: llvm.br ^bb4(%[[NEW_BUFFER]] : !llvm.ptr) + + +func @callee_multiple_args_unranked(%arg0 : memref<*xf32>, %arg1 : f32, + %arg2 : memref<*xf32>, %arg3 : index) { + %c0 = constant 0 : index + %0 = memref.cast %arg0 : memref<*xf32> to memref + %1 = memref.load %0[%c0, %arg3] : memref + %2 = memref.cast %arg2 : memref<*xf32> to memref + %3 = memref.load %2[%arg3] : memref return } -// CHECK-LABEL: llvm.func @return_two_var_memref -func @return_two_var_memref(%arg0: memref<4x3xf32>) -> (memref<*xf32>, memref<*xf32>) attributes { llvm.emit_c_interface } { - // Match the construction of the unranked descriptor. - // CHECK: %[[ALLOCA:.*]] = llvm.alloca - // CHECK: %[[MEMORY:.*]] = llvm.bitcast %[[ALLOCA]] - // CHECK: %[[DESC_0:.*]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> - // CHECK: %[[DESC_1:.*]] = llvm.insertvalue %{{.*}}, %[[DESC_0]][0] - // CHECK: %[[DESC_2:.*]] = llvm.insertvalue %[[MEMORY]], %[[DESC_1]][1] - %0 = memref.cast %arg0 : memref<4x3xf32> to memref<*xf32> - - // Only check that we allocate the memory for each operand of the "return" - // separately, even if both operands are the same value. The calling - // convention requires the caller to free them and the caller cannot know - // whether they are the same value or not. 
- // CHECK: %[[ALLOCATED_1:.*]] = llvm.call @malloc(%{{.*}}) - // CHECK: %[[SOURCE_1:.*]] = llvm.extractvalue %[[DESC_2]][1] - // CHECK: "llvm.intr.memcpy"(%[[ALLOCATED_1]], %[[SOURCE_1]], %{{.*}}, %[[FALSE:.*]]) - // CHECK: %[[RES_1:.*]] = llvm.mlir.undef - // CHECK: %[[RES_11:.*]] = llvm.insertvalue %{{.*}}, %[[RES_1]][0] - // CHECK: %[[RES_12:.*]] = llvm.insertvalue %[[ALLOCATED_1]], %[[RES_11]][1] - - // CHECK: %[[ALLOCATED_2:.*]] = llvm.call @malloc(%{{.*}}) - // CHECK: %[[SOURCE_2:.*]] = llvm.extractvalue %[[DESC_2]][1] - // CHECK: "llvm.intr.memcpy"(%[[ALLOCATED_2]], %[[SOURCE_2]], %{{.*}}, %[[FALSE]]) - // CHECK: %[[RES_2:.*]] = llvm.mlir.undef - // CHECK: %[[RES_21:.*]] = llvm.insertvalue %{{.*}}, %[[RES_2]][0] - // CHECK: %[[RES_22:.*]] = llvm.insertvalue %[[ALLOCATED_2]], %[[RES_21]][1] - - // CHECK: %[[RESULTS:.*]] = llvm.mlir.undef : !llvm.struct<(struct<(i64, ptr)>, struct<(i64, ptr)>)> - // CHECK: %[[RESULTS_1:.*]] = llvm.insertvalue %[[RES_12]], %[[RESULTS]] - // CHECK: %[[RESULTS_2:.*]] = llvm.insertvalue %[[RES_22]], %[[RESULTS_1]] - // CHECK: llvm.return %[[RESULTS_2]] - return %0, %0 : memref<*xf32>, memref<*xf32> -} - -// Check that the result memrefs are passed as parameter -// CHECK-LABEL: @_mlir_ciface_return_two_var_memref -// CHECK-SAME: (%{{.*}}: !llvm.ptr)>, struct<(i64, ptr)>)>>, -// CHECK-SAME: %{{.*}}: !llvm.ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>>) +func @caller_multiple_args_unranked(%arg0 : memref<*xf32>, %arg1 : f32, + %arg2 : memref<*xf32>, %arg3 : index) { + call @callee_multiple_args_unranked(%arg0, %arg1, %arg2, %arg3) + : (memref<*xf32>, f32, memref<*xf32>, index) -> () + return +} +// CHECK-LABEL: llvm.func @caller_multiple_args_unranked +// CHECK-SAME: %[[ARG0_RANK:.*]]: i64, %[[ARG0_INNER_DESC:arg1]]: !llvm.ptr, +// CHECK-SAME: %[[FARG:arg2]]: f32, +// CHECK-SAME: %[[ARG1_RANK:.*]]: i64, %[[ARG1_INNER_DESC:arg4]]: !llvm.ptr, +// CHECK-SAME: %[[IARG:.*]]: i64 + +// Populate the descriptor for arg0. 
+// CHECK: %[[ARG0_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG0_DESC1:.*]] = llvm.insertvalue %[[ARG0_RANK]], %[[ARG0_DESC0]][0] +// CHECK: %[[ARG0_DESC2:.*]] = llvm.insertvalue %[[ARG0_INNER_DESC]], %[[ARG0_DESC1]][1] + +// Populate the descriptor for arg2. +// CHECK: %[[ARG1_DESC0:.*]] = llvm.mlir.undef +// CHECK: %[[ARG1_DESC1:.*]] = llvm.insertvalue %[[ARG1_RANK]], %[[ARG1_DESC0]][0] +// CHECK: %[[ARG1_DESC2:.*]] = llvm.insertvalue %[[ARG1_INNER_DESC]], %[[ARG1_DESC1]][1] + +// Unpack descriptor for arg0. +// CHECK: %[[ARG0_RANK:.*]] = llvm.extractvalue %[[ARG0_DESC2]][0] +// CHECK: %[[ARG0_INNER_DESC:.*]] = llvm.extractvalue %[[ARG0_DESC2]][1] + +// Unpack descriptor for arg2. +// CHECK: %[[ARG1_RANK:.*]] = llvm.extractvalue %[[ARG1_DESC2]][0] +// CHECK: %[[ARG1_INNER_DESC:.*]] = llvm.extractvalue %[[ARG1_DESC2]][1] + +// Call the function and return. +// CHECK: llvm.call @callee_multiple_args_unranked(%[[ARG0_RANK]], %[[ARG0_INNER_DESC]], %[[FARG]], %[[ARG1_RANK]], %[[ARG1_INNER_DESC]], %[[IARG]]) +// CHECK: llvm.return