diff --git a/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h b/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h
@@ -0,0 +1,26 @@
+//===- AMDGPUToROCDL.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_
+#define MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_
+
+#include <memory>
+
+namespace mlir {
+
+class LLVMTypeConverter;
+class RewritePatternSet;
+class Pass;
+
+void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
+                                             RewritePatternSet &patterns);
+
+std::unique_ptr<Pass> createConvertAMDGPUToROCDLPass();
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -9,6 +9,7 @@
 #ifndef MLIR_CONVERSION_PASSES_H
 #define MLIR_CONVERSION_PASSES_H
 
+#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h"
 #include "mlir/Conversion/ArithmeticToSPIRV/ArithmeticToSPIRV.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -73,6 +73,22 @@
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// AMDGPUToROCDL
+//===----------------------------------------------------------------------===//
+
+def ConvertAMDGPUToROCDL : Pass<"convert-amdgpu-to-rocdl"> {
+  let summary = "Convert AMDGPU dialect to ROCDL dialect";
+  let description = [{
+    This pass converts supported AMDGPU ops to ROCDL dialect intrinsics.
+  }];
+  let constructor = "mlir::createConvertAMDGPUToROCDLPass()";
+  let dependentDialects = [
+    "LLVM::LLVMDialect",
+    "ROCDL::ROCDLDialect",
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // ArithmeticToLLVM
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
@@ -0,0 +1,166 @@
+//===-- AMDGPU.td - AMDGPU dialect definitions -*- tablegen -*-----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU
+#define AMDGPU
+
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/OpBase.td"
+
+def AMDGPU_Dialect : Dialect {
+  let name = "amdgpu";
+  let cppNamespace = "::mlir::amdgpu";
+  let description = [{
+    The `AMDGPU` dialect provides wrappers around AMD-specific functionality
+    and LLVM intrinsics.
+    These wrappers should be used in conjunction with more generic dialects,
+    such as `gpu` and `vector`, when generating LLVM IR that will eventually be
+    executed on AMD hardware.
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Op definitions
+//===----------------------------------------------------------------------===//
+
+class AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
+  Op<AMDGPU_Dialect, mnemonic, traits> {}
+
+/// Raw buffer load
+def AMDGPU_RawBufferLoadOp :
+    AMDGPU_Op<"raw_buffer_load", [AllElementTypesMatch<["value", "memref"]>,
+      AttrSizedOperandSegments]>,
+    Arguments<(ins Arg<AnyMemRef, "buffer to read from", [MemRead]>:$memref,
+                   BoolAttr:$targetIsRDNA,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)>,
+    Results<(outs AnyTypeOf<[BF16, F16, F32, I32, I8,
+                             VectorOfLengthAndType<[2, 4], [F32, I32]>,
+                             VectorOfLengthAndType<[2, 4, 8], [F16, BF16]>,
+                             VectorOfLengthAndType<[2, 4, 8, 16], [I8]>]>:$value)> {

+  let summary = "Raw Buffer load, exposing GCN features";
+  let description = [{
+    The `amdgpu.raw_buffer_load` op is a wrapper around the buffer load
+    intrinsics available on AMD GPUs, including extensions in newer GPUs.
+
+    The index into the buffer is computed as for `memref.load` with the
+    addition of `indexOffset` and `sgprOffset` (which is added after bounds
+    checks and includes any offset present on the memref type if it's
+    non-zero).
+
+    All indices and offsets are in units of the memref's data type and are
+    converted to bytes during lowering.
+
+    When a load is out of bounds, the instruction returns zero.
+    For vector loads, each component's address is bounds-checked individually.
+
+    The memref struct is converted into a buffer resource (a V#) and the
+    arguments are translated to intrinsic arguments as follows:
+    - The base address of the buffer is the base address of the memref
+    - The stride is 0 to enable raw mode
+    - The number of records is the size of the memref, in bytes.
+      In the case of dynamically-shaped memrefs, this is computed at runtime
+      as max_d (size(d) * stride(d)) * sizeof(elementType(memref))
+    - The offset enable bit is 1, the index enable bit is 0
+    - The thread ID addition bit is off
+    - If the target is RDNA, OOB_SELECT is set to 2 to disable bounds checks
+      when `boundsCheck` is false and to 1 otherwise; on other targets these
+      bits are left at 0
+    - The cache coherency bits are off
+    - `targetIsRDNA` controls the setting of some reserved values that differ
+      between RDNA and CDNA cores
+  }];
+  let assemblyFormat = [{
+    attr-dict $memref `[` $indices `]`
+      (`sgprOffset` $sgprOffset^)? `:`
+      type($memref) `,` type($indices) `->` type($value)
+  }];
+  let hasVerifier = 1;
+}
+
+/// Raw buffer store
+def AMDGPU_RawBufferStoreOp :
+    AMDGPU_Op<"raw_buffer_store", [AllElementTypesMatch<["value", "memref"]>,
+      AttrSizedOperandSegments]>,
+    Arguments<(ins AnyTypeOf<[BF16, F16, F32, I32, I8,
+                              VectorOfLengthAndType<[2, 4], [F32, I32]>,
+                              VectorOfLengthAndType<[2, 4, 8], [F16, BF16]>,
+                              VectorOfLengthAndType<[2, 4, 8, 16], [I8]>]>:$value,
+                   Arg<AnyMemRef, "buffer to write to", [MemWrite]>:$memref,
+                   BoolAttr:$targetIsRDNA,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)> {

+  let summary = "Raw Buffer Store, exposing GCN features";
+  let description = [{
+    The `amdgpu.raw_buffer_store` op is a wrapper around the buffer store
+    intrinsics available on AMD GPUs, including extensions in newer GPUs.
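+
+    For example, a scalar `i32` store, written with the placeholder SSA names
+    `%value`, `%buf`, and `%idx` (mirroring the lowering tests added in this
+    patch), looks like:
+
+    ```mlir
+    amdgpu.raw_buffer_store {boundsCheck = true, targetIsRDNA = false}
+        %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
+    ```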
+
+    The store index is computed as in `memref.store` with the addition of
+    `indexOffset` (which is included for uniformity with atomics and may be
+    useful when writing vectorized code) and `sgprOffset` (which is added
+    after bounds checks and implicitly includes the offset of the memref type
+    if non-zero). All index components are in terms of the elements of the
+    memref, not bytes, and are scaled to bytes during lowering.
+
+    Out of bounds stores are ignored in hardware, including the out of bounds
+    components of vector writes.
+
+    See `amdgpu.raw_buffer_load` for a description of how the underlying
+    instruction is constructed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $value `->` $memref `[` $indices `]`
+      (`sgprOffset` $sgprOffset^)? `:`
+      type($value) `->` type($memref) `,` type($indices)
+  }];
+  let hasVerifier = 1;
+}
+
+// Raw buffer atomic floating point add
+def AMDGPU_RawBufferAtomicFaddOp :
+    AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
+      AttrSizedOperandSegments]>,
+    Arguments<(ins F32:$value,
+                   Arg<AnyMemRef, "buffer to atomically add to",
+                       [MemRead, MemWrite]>:$memref,
+                   BoolAttr:$targetIsRDNA,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)> {

+  let summary = "Raw Buffer Floating-point Atomic Add (MI-* only)";
+  let description = [{
+    The `amdgpu.raw_buffer_atomic_fadd` op is a wrapper around the
+    buffer-based atomic floating-point addition available on the MI-* series
+    of AMD GPUs.
+
+    The index into the buffer is computed as for `memref.store` with the
+    addition of `indexOffset` (which is used to aid in emitting vectorized
+    code) and, if present, `sgprOffset` (which is added after bounds checks
+    and includes any non-zero offset on the memref type).
+
+    All indexing components are given in terms of the memref's element size,
+    not the byte lengths required by the intrinsic.
+
+    Out of bounds atomic operations are ignored in hardware.
+
+    See `amdgpu.raw_buffer_load` for a description of how the underlying
+    instruction is constructed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $value `->` $memref `[` $indices `]`
+      (`sgprOffset` $sgprOffset^)? `:`
+      type($value) `->` type($memref) `,` type($indices)
+  }];
+  let hasVerifier = 1;
+}
+
+#endif // AMDGPU
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h
@@ -0,0 +1,27 @@
+//===- AMDGPUDialect.h - MLIR Dialect for AMDGPU ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a dialect for MLIR wrappers around AMDGPU-specific
+// intrinsics and for other AMD GPU-specific functionality.
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_ +#define MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_ + +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" + +#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h.inc" + +#define GET_OP_CLASSES +#include "mlir/Dialect/AMDGPU/AMDGPU.h.inc" + +#endif // MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_ diff --git a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt @@ -0,0 +1,4 @@ +add_mlir_dialect(AMDGPU amdgpu) +add_mlir_doc(AMDGPU AMDGPU Dialects/ -gen-dialect-doc) + +set(LLVM_TARGET_DEFINITIONS AMDGPU.td) diff --git a/mlir/include/mlir/Dialect/CMakeLists.txt b/mlir/include/mlir/Dialect/CMakeLists.txt --- a/mlir/include/mlir/Dialect/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(Affine) +add_subdirectory(AMDGPU) add_subdirectory(Arithmetic) add_subdirectory(Async) add_subdirectory(ArmNeon) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -179,4 +179,57 @@ let hasCustomAssemblyFormat = 1; } +//===---------------------------------------------------------------------===// +// Raw buffer load/store intrinsics + +def ROCDL_RawBufferLoadOp : + ROCDL_Op<"raw.buffer.load">, + Results<(outs LLVM_Type:$res)>, + Arguments<(ins LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)> { + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_load, {$rsrc, $offset, + $soffset, $aux}, {$_resultType}); + }]; + let hasCustomAssemblyFormat = 1; +} + +def ROCDL_RawBufferStoreOp : + ROCDL_Op<"raw.buffer.store">, + Arguments<(ins LLVM_Type:$vdata, + LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)>{ + string llvmBuilder = [{ + auto vdataType = moduleTranslation.convertType(op.vdata().getType()); + createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_store, {$vdata, $rsrc, + $offset, $soffset, $aux}, {vdataType}); + }]; + let hasCustomAssemblyFormat = 1; +} + +//===---------------------------------------------------------------------===// +// MI-100 and MI-200 buffer atomic floating point add intrinsic + +def ROCDL_RawBufferAtomicFAddOp : + ROCDL_Op<"raw.buffer.atomic.fadd">, + Arguments<(ins LLVM_Type:$vdata, + LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)>{ + string llvmBuilder = [{ + auto vdataType = moduleTranslation.convertType(op.vdata().getType()); + createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_atomic_fadd, {$vdata, $rsrc, + $offset, $soffset, $aux}, {vdataType}); + }]; + let hasCustomAssemblyFormat = 1; +} + #endif // ROCDLIR_OPS diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -14,6 +14,7 @@ #ifndef MLIR_INITALLDIALECTS_H_ #define MLIR_INITALLDIALECTS_H_ +#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h" #include "mlir/Dialect/AMX/AMXDialect.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" @@ -69,6 +70,7 @@ registry.insert +struct 
RawBufferOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + static constexpr uint32_t maxVectorOpWidth = 128; + + Value createI32Constant(ConversionPatternRewriter &rewriter, Location loc, + int32_t value) const { + IntegerAttr valAttr = rewriter.getI32IntegerAttr(value); + Type llvmI32 = this->typeConverter->convertType(rewriter.getI32Type()); + return rewriter.create(loc, llvmI32, valAttr); + } + + LogicalResult + matchAndRewrite(GpuOp gpuOp, typename GpuOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = gpuOp.getLoc(); + Value memref = adaptor.memref(); + Value unconvertedMemref = gpuOp.memref(); + MemRefType memrefType = unconvertedMemref.getType().cast(); + + Value storeData = adaptor.getODSOperands(0)[0]; + if (storeData == memref) // no write component to this op + storeData = Value(); + Type wantedDataType; + if (storeData) + wantedDataType = storeData.getType(); + else + wantedDataType = gpuOp.getODSResults(0)[0].getType(); + + Type llvmWantedDataType = this->typeConverter->convertType(wantedDataType); + + Type i32 = rewriter.getI32Type(); + Type llvmI32 = this->typeConverter->convertType(i32); + + int64_t elementByteWidth = memrefType.getElementTypeBitWidth() / 8; + Value byteWidthConst = createI32Constant(rewriter, loc, elementByteWidth); + + // If we want to load a vector with total size <= 32 + // bits, use a scalar load and bitcast it. Similarly, if bitsize(T) < 32 + // and the + Type llvmBufferValType = llvmWantedDataType; + if (auto dataVector = wantedDataType.dyn_cast()) { + uint32_t elemBits = dataVector.getElementTypeBitWidth(); + uint32_t totalBits = elemBits * dataVector.getNumElements(); + if (totalBits > maxVectorOpWidth) + return gpuOp.emitOpError( + "Total width of loads or stores must be no more than " + + Twine(maxVectorOpWidth) + " bits, but we call for " + + Twine(totalBits) + + " bits. This should've been caught in validation"); + if (elemBits < 32) { + if (totalBits > 32) { + if (totalBits % 32 != 0) + return gpuOp.emitOpError("Load or store of more than 32-bits that " + "doesn't fit into words. 
Can't happen\n"); + llvmBufferValType = this->typeConverter->convertType( + VectorType::get(totalBits / 32, i32)); + } else { + llvmBufferValType = this->typeConverter->convertType( + rewriter.getIntegerType(totalBits)); + } + } + } + + SmallVector args; + if (storeData) { + if (llvmBufferValType != llvmWantedDataType) { + Value castForStore = + rewriter.create(loc, llvmBufferValType, storeData); + args.push_back(castForStore); + } else { + args.push_back(storeData); + } + } + + // Construct buffer descriptor from memref, attributes + int64_t offset = 0; + SmallVector strides; + if (failed(getStridesAndOffset(memrefType, strides, offset))) + return gpuOp.emitOpError("Can't lower non-stride-offset memrefs"); + + // Resource descriptor + // bits 0-47: base address + // bits 48-61: stride (0 for raw buffers) + // bit 62: texture cache coherency (always 0) + // bit 63: enable swizzles (always off for raw buffers) + // bits 64-95 (word 2): Number of records, units of stride + // bits 96-127 (word 3): See below + + Type llvm4xI32 = this->typeConverter->convertType(VectorType::get(4, i32)); + MemRefDescriptor memrefDescriptor(memref); + Type llvmI64 = this->typeConverter->convertType(rewriter.getI64Type()); + Type llvm2xI32 = this->typeConverter->convertType(VectorType::get(2, i32)); + + Value resource = rewriter.create(loc, llvm4xI32); + + Value ptr = memrefDescriptor.alignedPtr(rewriter, loc); + Value ptrAsInt = rewriter.create(loc, llvmI64, ptr); + Value ptrAsInts = + rewriter.create(loc, llvm2xI32, ptrAsInt); + for (int64_t i = 0; i < 2; ++i) { + Value idxConst = this->createIndexConstant(rewriter, loc, i); + Value part = + rewriter.create(loc, ptrAsInts, idxConst); + resource = rewriter.create( + loc, llvm4xI32, resource, part, idxConst); + } + + Value numRecords; + if (memrefType.hasStaticShape()) { + numRecords = createI32Constant( + rewriter, loc, + static_cast(memrefType.getNumElements() * elementByteWidth)); + } else { + Value maxIndex; + for (uint32_t i = 0, e = memrefType.getRank(); i < e; ++i) { + Value size = memrefDescriptor.size(rewriter, loc, i); + Value stride = memrefDescriptor.stride(rewriter, loc, i); + stride = rewriter.create(loc, stride, byteWidthConst); + Value maxThisDim = rewriter.create(loc, size, stride); + maxIndex = maxIndex ? rewriter.create(loc, maxIndex, + maxThisDim) + : maxThisDim; + } + numRecords = rewriter.create(loc, llvmI32, maxIndex); + } + resource = rewriter.create( + loc, llvm4xI32, resource, numRecords, + this->createIndexConstant(rewriter, loc, 2)); + + // Final word: + // bits 0-11: dst sel, ignored by these intrinsics + // bits 12-14: data format (ignored, must be nonzero, 7=float) + // bits 15-18: data format (ignored, must be nonzero, 4=32bit) + // bit 19: In nested heap (0 here) + // bit 20: Behavior on unmap (0 means "return 0 / ignore") + // bits 21-22: Index stride for swizzles (N/A) + // bit 23: Add thread ID (0) + // bit 24: Reserved to 1 (RDNA) or 0 (CDNA) + // bits 25-26: Reserved (0) + // bit 27: Buffer is non-volatile (CDNA only) + // bits 28-29: Out of bounds select (0 = structured, 1 = raw, 2 = none, 3 = + // swizzles) RDNA only + // bits 30-31: Type (must be 0) + uint32_t word3 = (7 << 12) | (4 << 15); + if (adaptor.targetIsRDNA()) { + word3 |= (1 << 24); + uint32_t oob = adaptor.boundsCheck() ? 
1 : 2; + word3 |= (oob << 28); + } + Value word3Const = createI32Constant(rewriter, loc, word3); + resource = rewriter.create( + loc, llvm4xI32, resource, word3Const, + this->createIndexConstant(rewriter, loc, 3)); + args.push_back(resource); + + // Indexing (voffset) + Value voffset; + for (auto &pair : llvm::enumerate(adaptor.indices())) { + size_t i = pair.index(); + Value index = pair.value(); + Value strideOp; + if (ShapedType::isDynamicStrideOrOffset(strides[i])) { + strideOp = rewriter.create( + loc, memrefDescriptor.stride(rewriter, loc, i), byteWidthConst); + } else { + strideOp = + createI32Constant(rewriter, loc, strides[i] * elementByteWidth); + } + index = rewriter.create(loc, index, strideOp); + voffset = + voffset ? rewriter.create(loc, voffset, index) : index; + } + if (adaptor.indexOffset().hasValue()) { + int32_t indexOffset = *gpuOp.indexOffset() * elementByteWidth; + Value extraOffsetConst = createI32Constant(rewriter, loc, indexOffset); + voffset = + voffset ? rewriter.create(loc, voffset, extraOffsetConst) + : extraOffsetConst; + } + args.push_back(voffset); + + Value sgprOffset = adaptor.sgprOffset(); + if (!sgprOffset) + sgprOffset = createI32Constant(rewriter, loc, 0); + if (ShapedType::isDynamicStrideOrOffset(offset)) + sgprOffset = rewriter.create( + loc, memrefDescriptor.offset(rewriter, loc), sgprOffset); + else if (offset > 0) + sgprOffset = rewriter.create( + loc, sgprOffset, createI32Constant(rewriter, loc, offset)); + args.push_back(sgprOffset); + + // bit 0: GLC = 0 (atomics drop value, less coherency) + // bits 1-2: SLC, DLC = 0 (similarly) + // bit 3: swizzled (0 for raw) + args.push_back(createI32Constant(rewriter, loc, 0)); + + llvm::SmallVector resultTypes(gpuOp->getNumResults(), + llvmBufferValType); + Operation *lowered = rewriter.create(loc, resultTypes, args, + ArrayRef()); + if (lowered->getNumResults() == 1) { + Value replacement = lowered->getResults()[0]; + if (llvmBufferValType != llvmWantedDataType) { + replacement = rewriter.create(loc, llvmWantedDataType, + replacement); + } + rewriter.replaceOp(gpuOp, replacement); + } else { + rewriter.eraseOp(gpuOp); + } + return success(); + } +}; + +struct ConvertAMDGPUToROCDLPass + : public ConvertAMDGPUToROCDLBase { + ConvertAMDGPUToROCDLPass() = default; + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + LLVMTypeConverter converter(&getContext()); + populateAMDGPUToROCDLConversionPatterns(converter, patterns); + LLVMConversionTarget target(getContext()); + target.addLegalDialect<::mlir::LLVM::LLVMDialect>(); + target.addLegalDialect<::mlir::ROCDL::ROCDLDialect>(); + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) + signalPassFailure(); + } +}; +} // namespace + +void mlir::populateAMDGPUToROCDLConversionPatterns( + LLVMTypeConverter &converter, RewritePatternSet &patterns) { + patterns.add< + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering>(converter); +} + +std::unique_ptr mlir::createConvertAMDGPUToROCDLPass() { + return std::make_unique(); +} diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt @@ -0,0 +1,20 @@ +add_mlir_conversion_library(MLIRAMDGPUToROCDL + AMDGPUToROCDL.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/AMDGPUToROCDL + + DEPENDS + MLIRConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + 
MLIRLLVMCommonConversion + MLIRLLVMIR + MLIRROCDLIR + MLIRAMDGPU + MLIRPass + MLIRTransforms + ) diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(AffineToStandard) +add_subdirectory(AMDGPUToROCDL) add_subdirectory(ArithmeticToLLVM) add_subdirectory(ArithmeticToSPIRV) add_subdirectory(ArmNeon2dToIntr) diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -18,6 +18,7 @@ #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" diff --git a/mlir/lib/Dialect/AMDGPU/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(IR) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -0,0 +1,60 @@ +//===- AMDGPUDialect.cpp - MLIR AMDGPU dialect implementation --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AMDGPU dialect and its operations. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/TypeUtilities.h" + +using namespace mlir; + +#include "mlir/Dialect/AMDGPU/AMDGPUDialect.cpp.inc" + +void amdgpu::AMDGPUDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "mlir/Dialect/AMDGPU/AMDGPU.cpp.inc" + >(); +} + +//===----------------------------------------------------------------------===// +// RawBuffer*Op +//===----------------------------------------------------------------------===// +template +static LogicalResult verifyRawBufferOp(T &op) { + MemRefType bufferType = op.memref().getType().template cast(); + if (bufferType.getMemorySpaceAsInt() != 0) + return op.emitOpError( + "Buffer ops must operate on a memref in global memory"); + if (!bufferType.hasRank()) + return op.emitOpError( + "Cannot meaningfully buffer_store to an unranked memref"); + if (static_cast(op.indices().size()) != bufferType.getRank()) + return op.emitOpError("Expected " + Twine(bufferType.getRank()) + + " indices to memref"); + return success(); +} + +LogicalResult amdgpu::RawBufferLoadOp::verify() { + return verifyRawBufferOp(*this); +} + +LogicalResult amdgpu::RawBufferStoreOp::verify() { + return verifyRawBufferOp(*this); +} + +LogicalResult amdgpu::RawBufferAtomicFaddOp::verify() { + return verifyRawBufferOp(*this); +} + +#define GET_OP_CLASSES +#include "mlir/Dialect/AMDGPU/AMDGPU.cpp.inc" diff --git a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt @@ -0,0 +1,13 @@ +add_mlir_dialect_library(MLIRAMDGPU + AMDGPUDialect.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU + + DEPENDS + MLIRAMDGPUIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRSideEffectInterfaces + ) diff --git a/mlir/lib/Dialect/CMakeLists.txt b/mlir/lib/Dialect/CMakeLists.txt --- a/mlir/lib/Dialect/CMakeLists.txt +++ b/mlir/lib/Dialect/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(Affine) +add_subdirectory(AMDGPU) add_subdirectory(Arithmetic) add_subdirectory(ArmNeon) add_subdirectory(ArmSVE) diff --git a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp --- a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp @@ -84,6 +84,76 @@ p << " " << getOperands() << " : " << vdata().getType(); } +// ::= +// `llvm.amdgcn.raw.buffer.load.* %rsrc, %offset, %soffset, %aux +// : result_type` +ParseResult RawBufferLoadOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 4) || parser.parseColonType(type) || + parser.addTypeToList(type, result.types)) + return failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + return parser.resolveOperands(ops, {i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands); +} + +void RawBufferLoadOp::print(OpAsmPrinter &p) { + p << " " << getOperands() << " : " << res().getType(); +} + +// ::= +// `llvm.amdgcn.raw.buffer.store.* %vdata, %rsrc, %offset, +// %soffset, %aux : result_type` +ParseResult RawBufferStoreOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 5) || parser.parseColonType(type)) + return 
failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + + if (parser.resolveOperands(ops, {type, i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands)) + return failure(); + return success(); +} + +void RawBufferStoreOp::print(OpAsmPrinter &p) { + p << " " << getOperands() << " : " << vdata().getType(); +} + +// ::= +// `llvm.amdgcn.raw.buffer.atomic.fadd.* %vdata, %rsrc, %offset, +// %soffset, %aux : result_type` +ParseResult RawBufferAtomicFAddOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 5) || parser.parseColonType(type)) + return failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + + if (parser.resolveOperands(ops, {type, i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands)) + return failure(); + return success(); +} + +void RawBufferAtomicFAddOp::print(mlir::OpAsmPrinter &p) { + p << " " << getOperands() << " : " << vdata().getType(); +} + //===----------------------------------------------------------------------===// // ROCDLDialect initialization, type parsing, and registration. //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -0,0 +1,110 @@ +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl | FileCheck %s + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32 +func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi32>, i32 -> i32 + func.return %0 : i32 +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_rdna +func.func @gpu_gcn_raw_buffer_load_i32_rdna(%buf: memref<64xi32>, %idx: i32) -> i32 { + // CHECK: %[[word3:.*]] = llvm.mlir.constant(285372416 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = true} %buf[%idx] : memref<64xi32>, i32 -> i32 + func.return %0 : i32 +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_rdna_oob_off +func.func @gpu_gcn_raw_buffer_load_i32_rdna_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 { + // CHECK: %[[word3:.*]] = llvm.mlir.constant(553807872 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = false, targetIsRDNA = true} %buf[%idx] : memref<64xi32>, i32 -> i32 + func.return %0 : i32 +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi32 +func.func @gpu_gcn_raw_buffer_load_2xi32(%buf: memref<64xi32>, %idx: i32) -> vector<2xi32> { + // 
CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi32> + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi32>, i32 -> vector<2xi32> + func.return %0 : vector<2xi32> +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i8 +func.func @gpu_gcn_raw_buffer_load_i8(%buf: memref<64xi8>, %idx: i32) -> i8 { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i8 + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi8>, i32 -> i8 + func.return %0 : i8 +} +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi8 +func.func @gpu_gcn_raw_buffer_load_2xi8(%buf: memref<64xi8>, %idx: i32) -> vector<2xi8> { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16 + // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : i16 to vector<2xi8> + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi8>, i32 -> vector<2xi8> + func.return %0 : vector<2xi8> +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_16xi8 +func.func @gpu_gcn_raw_buffer_load_16xi8(%buf: memref<64xi8>, %idx: i32) -> vector<16xi8> { + // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32> + // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : vector<4xi32> to vector<16xi8> + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi8>, i32 -> vector<16xi8> + func.return %0 : vector<16xi8> +} + +// Since the lowering logic is shared with loads, only bitcasts need to be rechecked +// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32 +func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: rocdl.raw.buffer.store %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + amdgpu.raw_buffer_store {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32 + func.return +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_2xi8 +func.func @gpu_gcn_raw_buffer_store_2xi8(%value: vector<2xi8>, %buf: memref<64xi8>, %idx: i32) { + // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<2xi8> to i16 + // CHECK: rocdl.raw.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16 + amdgpu.raw_buffer_store {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : vector<2xi8> -> memref<64xi8>, i32 + func.return +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_16xi8 +func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64xi8>, %idx: i32) { + // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<16xi8> to vector<4xi32> + // CHECK: rocdl.raw.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32> + amdgpu.raw_buffer_store {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : vector<16xi8> -> memref<64xi8>, i32 + func.return +} + +// And more so 
for atomic add +// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32 +func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: rocdl.raw.buffer.atomic.fadd %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32 + amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32 + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -0,0 +1,61 @@ +// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s +// Verify the printed output can be parsed. +// RUN: mlir-opt -allow-unregistered-dialect %s | mlir-opt -allow-unregistered-dialect | FileCheck %s +// Verify the generic form can be parsed. +// RUN: mlir-opt -allow-unregistered-dialect -mlir-print-op-generic %s | mlir-opt -allow-unregistered-dialect | FileCheck %s + +// CHECK-LABEL: func @raw_buffer_load_f32_from_rank_1 +func.func @raw_buffer_load_f32_from_rank_1(%src : memref<128xf32>, %offset : i32, %idx0 : i32) -> f32 { + // CHECK: amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}}[{{.*}}] sgprOffset %{{.*}} : memref<128xf32>, i32 -> f32 + %0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %src[%idx0] sgprOffset %offset : memref<128xf32>, i32 -> f32 + func.return %0 : f32 +} + +// CHECK-LABEL: func @raw_buffer_load_f32_from_rank_4 +func.func @raw_buffer_load_f32_from_rank_4(%src : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) -> f32 { + // CHECK: amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> f32 + %0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %src[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> f32 + func.return %0 : f32 +} + +// CHECK-LABEL: func @raw_buffer_load_4xf32_from_rank_4 +func.func @raw_buffer_load_4xf32_from_rank_4(%src : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) -> vector<4xf32> { + // CHECK: amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> vector<4xf32> + %0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %src[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> vector<4xf32> + func.return %0 : vector<4xf32> +} + +// CHECK-LABEL: func @raw_buffer_store_f32_to_rank_1 +func.func @raw_buffer_store_f32_to_rank_1(%value : f32, %dst : memref<128xf32>, %offset : i32, %idx0 : i32) { + // CHECK: amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128xf32>, i32 + amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0] sgprOffset %offset : f32 -> 
memref<128xf32>, i32 + func.return +} + +// CHECK-LABEL: func @raw_buffer_store_f32_to_rank_4 +func.func @raw_buffer_store_f32_to_rank_4(%value : f32, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) { + // CHECK: amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + func.return +} + +// CHECK-LABEL: func @raw_buffer_store_4xf32_to_rank_4 +func.func @raw_buffer_store_4xf32_to_rank_4(%value : vector<4xf32>, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) { + // CHECK: amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : vector<4xf32> -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : vector<4xf32> -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + func.return +} + +// CHECK-LABEL: func @raw_buffer_atomic_fadd_f32_to_rank_1 +func.func @raw_buffer_atomic_fadd_f32_to_rank_1(%value : f32, %dst : memref<128xf32>, %offset : i32, %idx0 : i32) { + // CHECK: amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128xf32>, i32 + amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0] sgprOffset %offset : f32 -> memref<128xf32>, i32 + func.return +} + +// CHECK-LABEL: func @raw_buffer_atomic_fadd_f32_to_rank_4 +func.func @raw_buffer_atomic_fadd_f32_to_rank_4(%value : f32, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) { + // CHECK: amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + func.return +} diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -167,6 +167,34 @@ llvm.return } +llvm.func @rocdl.raw.buffer(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %aux : i32, %vdata1 : f32, + %vdata2 : vector<2xf32>, %vdata4 : vector<4xf32>) { + // CHECK-LABEL: rocdl.raw.buffer + // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : f32 + // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<2xf32> + // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<4xf32> + + // CHECK: rocdl.raw.buffer.store %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : f32 + // CHECK: rocdl.raw.buffer.store %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<2xf32> + // CHECK: rocdl.raw.buffer.store %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : 
vector<4xf32> + + // CHECK: rocdl.raw.buffer.atomic.fadd %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : f32 + + %r1 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : f32 + %r2 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xf32> + %r4 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xf32> + + rocdl.raw.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : f32 + rocdl.raw.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xf32> + rocdl.raw.buffer.store %vdata4, %rsrc, %offset, %offset, %aux : vector<4xf32> + + rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32 + + llvm.return +} + // ----- // expected-error@below {{attribute attached to unexpected op}} diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -169,21 +169,61 @@ %slc = llvm.mlir.constant(true) : i1 // CHECK-LABEL: rocdl.mubuf // CHECK: call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - %r1 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> // CHECK: call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - %r2 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> // CHECK: call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - %r4 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<4xf32> // CHECK: call void @llvm.amdgcn.buffer.store.v1f32(<1 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - rocdl.buffer.store %vdata1, %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> // CHECK: call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - rocdl.buffer.store %vdata2, %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> // CHECK: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) + + %r1 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> + %r2 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> + %r4 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<4xf32> + + rocdl.buffer.store %vdata1, %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> + rocdl.buffer.store %vdata2, %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> rocdl.buffer.store %vdata4, %rsrc, %vindex, %offset, %glc, %slc : vector<4xf32> llvm.return } +llvm.func @rocdl.raw.buffer(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %vdata1 : i32, + %vdata2 : vector<2xi32>, + %vdata4 : vector<4xi32>) { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.buffer + // CHECK: call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + // CHECK: call void @llvm.amdgcn.raw.buffer.store.i32(i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call void 
@llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + %r1 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : i32 + %r2 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xi32> + %r4 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xi32> + + rocdl.raw.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : i32 + rocdl.raw.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xi32> + rocdl.raw.buffer.store %vdata4, %rsrc, %offset, %soffset, %aux : vector<4xi32> + + llvm.return +} + +llvm.func @rocdl.raw.buffer.atomic(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %vdata1 : f32) { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.buffer.atomic + // CHECK: call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32 + + llvm.return +} + // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 256" "amdgpu-implicitarg-num-bytes"="56" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 1024" diff --git a/mlir/test/mlir-opt/commandline.mlir b/mlir/test/mlir-opt/commandline.mlir --- a/mlir/test/mlir-opt/commandline.mlir +++ b/mlir/test/mlir-opt/commandline.mlir @@ -2,6 +2,7 @@ // CHECK: Available Dialects: // CHECK-NEXT: acc // CHECK-NEXT: affine +// CHECK-NEXT: amdgpu // CHECK-NEXT: amx // CHECK-NEXT: arith // CHECK-NEXT: arm_neon diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -1148,6 +1148,69 @@ deps = [":AffineOpsTdFiles"], ) +##---------------------------------------------------------------------------## +# AMDGPU dialect. +##---------------------------------------------------------------------------## + +td_library( + name = "AMDGPUTdFiles", + srcs = ["include/mlir/Dialect/AMDGPU/AMDGPU.td"], + includes = ["include"], + deps = [ + ":SideEffectInterfacesTdFiles", + ], +) + +gentbl_cc_library( + name = "AMDGPUIncGen", + strip_include_prefix = "include", + tbl_outs = [ + ( + [ + "-gen-dialect-decls", + "-dialect=AMDGPU", + ], + "include/mlir/Dialect/AMDGPU/AMDGPUDialect.h.inc", + ), + ( + [ + "-gen-dialect-defs", + "-dialect=AMDGPU", + ], + "include/mlir/Dialect/AMDGPU/AMDGPUDialect.cpp.inc", + ), + ( + ["-gen-op-decls"], + "include/mlir/Dialect/AMDGPU/AMDGPU.h.inc", + ), + ( + ["-gen-op-defs"], + "include/mlir/Dialect/AMDGPU/AMDGPU.cpp.inc", + ), + ( + ["-gen-op-doc"], + "g3doc/Dialects/AMDGPU/AMDGPU.md", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/AMDGPU/AMDGPU.td", + deps = [":AMDGPUTdFiles"], +) + +cc_library( + name = "AMDGPU", + srcs = ["lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp"], + hdrs = ["include/mlir/Dialect/AMDGPU/AMDGPUDialect.h"], + includes = ["include"], + deps = [ + ":IR", + ":AMDGPUIncGen", + ":SideEffectInterfaces", + "//llvm:Core", + "//llvm:Support", + ], +) + ##---------------------------------------------------------------------------## # EmitC dialect. 
##---------------------------------------------------------------------------## @@ -2384,6 +2447,7 @@ includes = ["include"], deps = [ ":AffineToStandard", + ":AMDGPUToROCDL", ":ArithmeticToLLVM", ":ArithmeticToSPIRV", ":ArmNeon2dToIntr", @@ -3616,22 +3680,24 @@ ], ) + + cc_library( - name = "NVGPUToNVVM", + name = "AMDGPUToROCDL", srcs = glob([ - "lib/Conversion/NVGPUToNVVM/*.cpp", - "lib/Conversion/NVGPUToNVVM/*.h", + "lib/Conversion/AMDGPUToROCDL/*.cpp", + "lib/Conversion/AMDGPUToROCDL/*.h", ]) + [":ConversionPassDetail"], hdrs = glob([ - "include/mlir/Conversion/NVGPUToNVVM/*.h", + "include/mlir/Conversion/AMDGPUToROCDL/*.h", ]), includes = ["include"], deps = [ ":ConversionPassIncGen", ":IR", ":LLVMCommonConversion", - ":NVGPU", - ":NVVMDialect", + ":AMDGPU", + ":AMDGPUDialect", ":Pass", ":Transforms", "//llvm:Support", @@ -6049,6 +6115,8 @@ ":AffinePassIncGen", ":AffineToStandard", ":AffineTransforms", + ":AMDGPU", + ":AMDGPUToROCDL", ":ArithmeticDialect", ":ArithmeticToLLVM", ":ArithmeticToSPIRV",