diff --git a/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h b/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h --- a/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h +++ b/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h @@ -8,7 +8,7 @@ #ifndef MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_ #define MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_ -#include "mlir/Conversion/AMDGPUToROCDL/Chipset.h" +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include #include diff --git a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt --- a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt @@ -1,12 +1,2 @@ -add_mlir_dialect(AMDGPU amdgpu) -add_mlir_doc(AMDGPU AMDGPU Dialects/ -gen-dialect-doc) - -set(LLVM_TARGET_DEFINITIONS AMDGPU.td) -mlir_tablegen(AMDGPUEnums.h.inc -gen-enum-decls) -mlir_tablegen(AMDGPUEnums.cpp.inc -gen-enum-defs) -add_public_tablegen_target(MLIRAMDGPUEnumsGen) - -set(LLVM_TARGET_DEFINITIONS AMDGPU.td) -mlir_tablegen(AMDGPUAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=amdgpu) -mlir_tablegen(AMDGPUAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=amdgpu) -add_public_tablegen_target(MLIRAMDGPUAttributesIncGen) +add_subdirectory(IR) +add_subdirectory(Transforms) diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td rename from mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td rename to mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td --- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -221,7 +221,7 @@ def AMDGPU_RawBufferAtomicFmaxOp : AMDGPU_Op<"raw_buffer_atomic_fmax", [AllElementTypesMatch<["value", "memref"]>, AttrSizedOperandSegments]>, - Arguments<(ins F32:$value, + Arguments<(ins AnyTypeOf<[F32, F64]>:$value, Arg:$memref, Variadic:$indices, DefaultValuedAttr:$boundsCheck, diff --git 
a/mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h rename from mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h rename to mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h --- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h @@ -11,22 +11,22 @@ // //===----------------------------------------------------------------------===// -#ifndef MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_ -#define MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_ +#ifndef MLIR_DIALECT_AMDGPU_IR_AMDGPUDIALECT_H_ +#define MLIR_DIALECT_AMDGPU_IR_AMDGPUDIALECT_H_ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/OpDefinition.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h.inc" -#include "mlir/Dialect/AMDGPU/AMDGPUEnums.h.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.h.inc" #define GET_ATTRDEF_CLASSES -#include "mlir/Dialect/AMDGPU/AMDGPUAttributes.h.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.h.inc" #define GET_OP_CLASSES -#include "mlir/Dialect/AMDGPU/AMDGPU.h.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPU.h.inc" -#endif // MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_ +#endif // MLIR_DIALECT_AMDGPU_IR_AMDGPUDIALECT_H_ diff --git a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/IR/CMakeLists.txt copy from mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt copy to mlir/include/mlir/Dialect/AMDGPU/IR/CMakeLists.txt diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/Transforms/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/CMakeLists.txt @@ -0,0 +1,6 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls -name AMDGPU) +add_public_tablegen_target(MLIRAMDGPUTransformsIncGen) 
+add_dependencies(mlir-headers MLIRAMDGPUTransformsIncGen) + +add_mlir_doc(Passes AMDGPUPasses ./ -gen-pass-doc) diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h @@ -0,0 +1,33 @@ +//===-- Passes.h - AMDGPU transformation pass declarations --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the transformation passes for the AMDGPU Dialect in MLIR. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_H_ +#define MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_H_ + +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +class ConversionTarget; +namespace amdgpu { + +#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc" + +void populateAmdgpuEmulateAtomicsPatterns(ConversionTarget &target, + RewritePatternSet &patterns, + Chipset chipset); +} // namespace amdgpu +} // namespace mlir + +#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_H_ diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -0,0 +1,33 @@ +//===-- Passes.td - AMDGPU pass declarations ----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the passes for the AMDGPU Dialect in MLIR. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_ +#define MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_ + +include "mlir/Pass/PassBase.td" + +def AmdgpuEmulateAtomicsPass : Pass<"amdgpu-emulate-atomics"> { + let summary = "Emulate atomic operations on chipsets that do not support them"; + let description = [{ + This pass rewrites any AMDGPU-specific atomic operation that is not supported + on the given `chipset` into a compare-and-swap loop. + }]; + let dependentDialects = [ + "cf::ControlFlowDialect", + "arith::ArithDialect", + ]; + let options = [Option<"chipset", "chipset", "std::string", + /*default=*/"\"gfx000\"", + "Chipset that these operations will run on">]; +} + +#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_ diff --git a/mlir/include/mlir/Conversion/AMDGPUToROCDL/Chipset.h b/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h rename from mlir/include/mlir/Conversion/AMDGPUToROCDL/Chipset.h rename to mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h --- a/mlir/include/mlir/Conversion/AMDGPUToROCDL/Chipset.h +++ b/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h @@ -5,8 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#ifndef MLIR_CONVERSION_AMDGPUTOROCDL_CHIPSET_H_ -#define MLIR_CONVERSION_AMDGPUTOROCDL_CHIPSET_H_ +#ifndef MLIR_DIALECT_AMDGPU_UTILS_CHIPSET_H_ +#define MLIR_DIALECT_AMDGPU_UTILS_CHIPSET_H_ #include "mlir/Support/LogicalResult.h" diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -14,7 +14,7 @@ #ifndef 
MLIR_INITALLDIALECTS_H_ #define MLIR_INITALLDIALECTS_H_ -#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" #include "mlir/Dialect/AMX/AMXDialect.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h" diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -15,6 +15,7 @@ #define MLIR_INITALLPASSES_H_ #include "mlir/Conversion/Passes.h" +#include "mlir/Dialect/AMDGPU/Transforms/Passes.h" #include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" #include "mlir/Dialect/Async/Passes.h" @@ -56,6 +57,7 @@ // Dialect passes registerAffinePasses(); + amdgpu::registerAMDGPUPasses(); registerAsyncPasses(); arith::registerArithPasses(); bufferization::registerBufferizationPasses(); diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -10,7 +10,7 @@ #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" -#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/Pass/Pass.h" diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt --- a/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt +++ b/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_conversion_library(MLIRAMDGPUToROCDL AMDGPUToROCDL.cpp - Chipset.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/AMDGPUToROCDL @@ -16,6 +15,7 @@ MLIRLLVMDialect MLIRROCDLDialect MLIRAMDGPUDialect + MLIRAMDGPUUtils MLIRPass 
MLIRTransforms ) diff --git a/mlir/lib/Dialect/AMDGPU/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt --- a/mlir/lib/Dialect/AMDGPU/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt @@ -1 +1,3 @@ add_subdirectory(IR) +add_subdirectory(Transforms) +add_subdirectory(Utils) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" @@ -30,16 +30,16 @@ using namespace mlir; using namespace mlir::amdgpu; -#include "mlir/Dialect/AMDGPU/AMDGPUDialect.cpp.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.cpp.inc" void AMDGPUDialect::initialize() { addOperations< #define GET_OP_LIST -#include "mlir/Dialect/AMDGPU/AMDGPU.cpp.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc" >(); addAttributes< #define GET_ATTRDEF_LIST -#include "mlir/Dialect/AMDGPU/AMDGPUAttributes.cpp.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc" >(); } @@ -282,10 +282,10 @@ return success(); } -#include "mlir/Dialect/AMDGPU/AMDGPUEnums.cpp.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.cpp.inc" #define GET_ATTRDEF_CLASSES -#include "mlir/Dialect/AMDGPU/AMDGPUAttributes.cpp.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc" #define GET_OP_CLASSES -#include "mlir/Dialect/AMDGPU/AMDGPU.cpp.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc" diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt @@ -0,0 +1,19 @@ +add_mlir_dialect_library(MLIRAMDGPUTransforms + 
EmulateAtomics.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms + + DEPENDS + MLIRAMDGPUTransformsIncGen + + LINK_LIBS PUBLIC + MLIRAMDGPUDialect + MLIRAMDGPUUtils + MLIRArithDialect + MLIRControlFlowDialect + MLIRIR + MLIRPass + MLIRTransforms + MLIRTransformUtils + ) diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp @@ -0,0 +1,189 @@ +//===- EmulateAtomics.cpp - Emulate unsupported AMDGPU atomics ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/AMDGPU/Transforms/Passes.h" + +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir::amdgpu { +#define GEN_PASS_DEF_AMDGPUEMULATEATOMICSPASS +#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc" +} // namespace mlir::amdgpu + +using namespace mlir; +using namespace mlir::amdgpu; + +namespace { +struct AmdgpuEmulateAtomicsPass + : public amdgpu::impl::AmdgpuEmulateAtomicsPassBase< + AmdgpuEmulateAtomicsPass> { + using AmdgpuEmulateAtomicsPassBase< + AmdgpuEmulateAtomicsPass>::AmdgpuEmulateAtomicsPassBase; + void runOnOperation() override; +}; + +template +struct RawBufferAtomicByCasPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + using Adaptor = typename AtomicOp::Adaptor; + + LogicalResult + 
matchAndRewrite(AtomicOp atomicOp, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; +} // namespace + +namespace { +enum class DataArgAction : unsigned char { + Duplicate, + Drop, +}; +} // namespace + +// Fix up the fact that, when we're migrating from a general buffer atomic +// to a load or to a CAS, the number of operands, and thus the number of +// entries needed in operand_segment_sizes, needs to change. We use this method +// because we'd like to preserve unknown attributes on the atomic instead of +// discarding them. +static void patchOperandSegmentSizes(ArrayRef attrs, + SmallVectorImpl &newAttrs, + DataArgAction action) { + newAttrs.reserve(attrs.size()); + for (NamedAttribute attr : attrs) { + if (attr.getName().getValue() != "operand_segment_sizes") { + newAttrs.push_back(attr); + continue; + } + auto segmentAttr = attr.getValue().cast(); + MLIRContext *context = segmentAttr.getContext(); + DenseI32ArrayAttr newSegments; + switch (action) { + case DataArgAction::Drop: + newSegments = DenseI32ArrayAttr::get( + context, segmentAttr.asArrayRef().drop_front()); + break; + case DataArgAction::Duplicate: { + SmallVector newVals; + ArrayRef oldVals = segmentAttr.asArrayRef(); + newVals.push_back(oldVals[0]); + newVals.append(oldVals.begin(), oldVals.end()); + newSegments = DenseI32ArrayAttr::get(context, newVals); + break; + } + } + newAttrs.push_back(NamedAttribute(attr.getName(), newSegments)); + } +} + +template +LogicalResult RawBufferAtomicByCasPattern::matchAndRewrite( + AtomicOp atomicOp, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const { + Location loc = atomicOp.getLoc(); + + ArrayRef origAttrs = atomicOp->getAttrs(); + ValueRange operands = adaptor.getOperands(); + Value data = operands.take_front()[0]; + ValueRange invariantArgs = operands.drop_front(); + Type dataType = data.getType(); + + SmallVector loadAttrs; + patchOperandSegmentSizes(origAttrs, loadAttrs, DataArgAction::Drop); + Value initialLoad = + 
rewriter.create(loc, dataType, invariantArgs, loadAttrs); + Block *currentBlock = rewriter.getInsertionBlock(); + Block *afterAtomic = + rewriter.splitBlock(currentBlock, rewriter.getInsertionPoint()); + Block *loopBlock = rewriter.createBlock(afterAtomic, {dataType}, {loc}); + + rewriter.setInsertionPointToEnd(currentBlock); + rewriter.create(loc, loopBlock, initialLoad); + + rewriter.setInsertionPointToEnd(loopBlock); + Value prevLoad = loopBlock->getArgument(0); + Value operated = rewriter.create(loc, data, prevLoad); + + SmallVector cmpswapAttrs; + patchOperandSegmentSizes(origAttrs, cmpswapAttrs, DataArgAction::Duplicate); + SmallVector cmpswapArgs = {operated, prevLoad}; + cmpswapArgs.append(invariantArgs.begin(), invariantArgs.end()); + Value atomicRes = rewriter.create( + loc, dataType, cmpswapArgs, cmpswapAttrs); + + // We care about exact bitwise equality here, so do some bitcasts. + // These will fold away during lowering to the ROCDL dialect, where + // an int->float bitcast is introduced to account for the fact that cmpswap + // only takes integer arguments. + + Value prevLoadForCompare = prevLoad; + Value atomicResForCompare = atomicRes; + if (auto floatDataTy = dataType.dyn_cast()) { + Type equivInt = rewriter.getIntegerType(floatDataTy.getWidth()); + prevLoadForCompare = + rewriter.create(loc, equivInt, prevLoad); + atomicResForCompare = + rewriter.create(loc, equivInt, atomicRes); + } + Value canLeave = rewriter.create( + loc, arith::CmpIPredicate::eq, atomicResForCompare, prevLoadForCompare); + rewriter.create(loc, canLeave, afterAtomic, ValueRange{}, + loopBlock, atomicRes); + rewriter.replaceOp(atomicOp, {}); + return success(); +} + +void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns( + ConversionTarget &target, RewritePatternSet &patterns, Chipset chipset) { + // Emulate atomic adds on gfx10 and on chipsets older than gfx908. 
+ if (chipset.majorVersion == 10 || chipset.majorVersion < 9 || + (chipset.majorVersion == 9 && chipset.minorVersion < 0x08)) { + target.addIllegalOp(); + } + // gfx9 has no to very limited support for floating-point min and max. + if (chipset.majorVersion == 9) { + if (chipset.minorVersion >= 0x0a) { + // gfx90a supports f64 max (and min, but we don't have a min wrapper right + // now) but all other types need to be emulated. + target.addDynamicallyLegalOp( + [](RawBufferAtomicFmaxOp op) -> bool { + return op.getValue().getType().isF64(); + }); + } else { + target.addIllegalOp(); + } + } + patterns + .add, + RawBufferAtomicByCasPattern>( + patterns.getContext()); +} + +void AmdgpuEmulateAtomicsPass::runOnOperation() { + Operation *op = getOperation(); + FailureOr maybeChipset = Chipset::parse(chipset); + if (failed(maybeChipset)) { + emitError(op->getLoc(), "Invalid chipset name: " + chipset); + return signalPassFailure(); + } + + MLIRContext &ctx = getContext(); + ConversionTarget target(ctx); + RewritePatternSet patterns(&ctx); + target.markUnknownOpDynamicallyLegal( + [](Operation *op) -> bool { return true; }); + + populateAmdgpuEmulateAtomicsPatterns(target, patterns, *maybeChipset); + if (failed(applyPartialConversion(op, target, std::move(patterns)))) + return signalPassFailure(); +} diff --git a/mlir/lib/Dialect/AMDGPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Utils/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/Utils/CMakeLists.txt @@ -0,0 +1,10 @@ +add_mlir_dialect_library(MLIRAMDGPUUtils + Chipset.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Utils + + LINK_LIBS PUBLIC + MLIRAMDGPUDialect + MLIRSupport + ) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/Chipset.cpp b/mlir/lib/Dialect/AMDGPU/Utils/Chipset.cpp rename from mlir/lib/Conversion/AMDGPUToROCDL/Chipset.cpp rename to mlir/lib/Dialect/AMDGPU/Utils/Chipset.cpp --- a/mlir/lib/Conversion/AMDGPUToROCDL/Chipset.cpp +++ 
b/mlir/lib/Dialect/AMDGPU/Utils/Chipset.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Conversion/AMDGPUToROCDL/Chipset.h" +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/StringRef.h" diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir @@ -0,0 +1,52 @@ +// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx90a %s | FileCheck %s --check-prefixes=CHECK,GFX9 +// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1030 %s | FileCheck %s --check-prefixes=CHECK,GFX10 + +// ----- + +func.func @atomic_fmax(%val: f32, %buffer: memref, %idx: i32) { +// CHECK: func @atomic_fmax +// CHECK-SAME: ([[val:%.+]]: f32, [[buffer:%.+]]: memref, [[idx:%.+]]: i32) +// CHECK: gpu.printf "Begin\0A" +// GFX10: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]] +// GFX9: [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]] +// GFX9: cf.br [[loop:\^.+]]([[ld]] : f32) +// GFX9: [[loop]]([[arg:%.+]]: f32): +// GFX9: [[operated:%.+]] = arith.maxf [[val]], [[arg]] +// GFX9: [[atomicRes:%.+]] = amdgpu.raw_buffer_atomic_cmpswap {foo, indexOffset = 4 : i32} [[operated]], [[arg]] -> [[buffer]][[[idx]]] +// GFX9: [[argCast:%.+]] = arith.bitcast [[arg]] : f32 to i32 +// GFX9: [[resCast:%.+]] = arith.bitcast [[atomicRes]] : f32 to i32 +// GFX9: [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]] +// GFX9: cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32) +// GFX9: [[post]]: +// CHECK-NEXT: gpu.printf "End\0A" + gpu.printf "Begin\n" + amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} %val -> %buffer[%idx] : f32 -> memref, i32 + gpu.printf "End\n" + func.return +} + +// ----- + +func.func 
@atomic_fmax_f64(%val: f64, %buffer: memref, %idx: i32) { +// CHECK: func @atomic_fmax_f64 +// CHECK-SAME: ([[val:%.+]]: f64, [[buffer:%.+]]: memref, [[idx:%.+]]: i32) +// CHECK: gpu.printf "Begin\0A" +// GFX9: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] +// GFX10: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] +// CHECK-NEXT: gpu.printf "End\0A" + gpu.printf "Begin\n" + amdgpu.raw_buffer_atomic_fmax %val -> %buffer[%idx] : f64 -> memref, i32 + gpu.printf "End\n" + func.return +} + +// ----- + +func.func @atomic_fadd(%val: f32, %buffer: memref, %idx: i32) { +// CHECK: func @atomic_fadd +// GFX9: amdgpu.raw_buffer_atomic_fadd +// GFX10: amdgpu.raw_buffer_load +// GFX10: amdgpu.raw_buffer_atomic_cmpswap + amdgpu.raw_buffer_atomic_fadd %val -> %buffer[%idx] : f32 -> memref, i32 + func.return +}