Diff 476606

mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td

Show All 17 Lines	def AMDGPU_Dialect : Dialect {
let cppNamespace = "::mlir::amdgpu";		let cppNamespace = "::mlir::amdgpu";
let description = [{		let description = [{
The `AMDGPU` dialect provides wrappers around AMD-specific functionality		The `AMDGPU` dialect provides wrappers around AMD-specific functionality
and LLVM intrinsics. These wrappers should be used in conjunction with		and LLVM intrinsics. These wrappers should be used in conjunction with
more generic dialects, such as `gpu` and `vector`, when generating LLVM IR		more generic dialects, such as `gpu` and `vector`, when generating LLVM IR
that will eventually be executed on AMD hardware.		that will eventually be executed on AMD hardware.
}];		}];


		let dependentDialects = [
		"arith::ArithDialect"
		];
let useDefaultAttributePrinterParser = 1;		let useDefaultAttributePrinterParser = 1;
		zero9178Unsubmitted Done Reply Inline Actions I believe you must also add the arithmetic dialect as a dependent dialect here, since the canonicalization now depends on it being loaded zero9178: I believe you must also add the arithmetic dialect as a dependent dialect here, since the…
		krzysz00AuthorUnsubmitted Done Reply Inline Actions Good catch, thanks! krzysz00: Good catch, thanks!
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// AMDGPU Op definitions		// AMDGPU Op definitions
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

class AMDGPU_Op<string mnemonic, list<Trait> traits = []> :		class AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
Op<AMDGPU_Dialect, mnemonic, traits> {}		Op<AMDGPU_Dialect, mnemonic, traits> {}
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	- If `boundsCheck` is false and the target chipset is RDNA, OOB_SELECT is set
to 2 to disable bounds checks, otherwise it is 3		to 2 to disable bounds checks, otherwise it is 3
- The cache coherency bits are off		- The cache coherency bits are off
}];		}];
let assemblyFormat = [{		let assemblyFormat = [{
attr-dict $memref `[` $indices `]`		attr-dict $memref `[` $indices `]`
(`sgprOffset` $sgprOffset^)? `:`		(`sgprOffset` $sgprOffset^)? `:`
type($memref) `,` type($indices) `->` type($value)		type($memref) `,` type($indices) `->` type($value)
}];		}];
		let hasCanonicalizer = 1;
let hasVerifier = 1;		let hasVerifier = 1;
}		}

/// Raw buffer store		/// Raw buffer store
def AMDGPU_RawBufferStoreOp :		def AMDGPU_RawBufferStoreOp :
AMDGPU_Op<"raw_buffer_store", [AllElementTypesMatch<["value", "memref"]>,		AMDGPU_Op<"raw_buffer_store", [AllElementTypesMatch<["value", "memref"]>,
AttrSizedOperandSegments]>,		AttrSizedOperandSegments]>,
Arguments<(ins AnyTypeOf<[BF16, F16, F32, I32, I8,		Arguments<(ins AnyTypeOf<[BF16, F16, F32, I32, I8,
Show All 25 Lines	let description = [{
See `amdgpu.raw_buffer_load` for a description of how the underlying		See `amdgpu.raw_buffer_load` for a description of how the underlying
instruction is constructed.		instruction is constructed.
}];		}];
let assemblyFormat = [{		let assemblyFormat = [{
attr-dict $value `->` $memref `[` $indices `]`		attr-dict $value `->` $memref `[` $indices `]`
(`sgprOffset` $sgprOffset^)? `:`		(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)		type($value) `->` type($memref) `,` type($indices)
}];		}];
		let hasCanonicalizer = 1;
let hasVerifier = 1;		let hasVerifier = 1;
}		}

// Raw buffer atomic floating point add		// Raw buffer atomic floating point add
def AMDGPU_RawBufferAtomicFaddOp :		def AMDGPU_RawBufferAtomicFaddOp :
AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,		AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
AttrSizedOperandSegments]>,		AttrSizedOperandSegments]>,
Arguments<(ins F32:$value,		Arguments<(ins F32:$value,
Show All 22 Lines	let description = [{
See `amdgpu.raw_buffer_load` for a description of how the underlying		See `amdgpu.raw_buffer_load` for a description of how the underlying
instruction is constructed.		instruction is constructed.
}];		}];
let assemblyFormat = [{		let assemblyFormat = [{
attr-dict $value `->` $memref `[` $indices `]`		attr-dict $value `->` $memref `[` $indices `]`
(`sgprOffset` $sgprOffset^)? `:`		(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)		type($value) `->` type($memref) `,` type($indices)
}];		}];
		let hasCanonicalizer = 1;
let hasVerifier = 1;		let hasVerifier = 1;
}		}

def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {		def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
let summary = "Barrier that includes a wait for LDS memory operations.";		let summary = "Barrier that includes a wait for LDS memory operations.";
let description = [{		let description = [{
`amdgpu.lds_barrier` is both a barrier (all workitems in a workgroup must reach		`amdgpu.lds_barrier` is both a barrier (all workitems in a workgroup must reach
the barrier before any of them may proceed past it) and a wait for all		the barrier before any of them may proceed past it) and a wait for all
▲ Show 20 Lines • Show All 102 Lines • Show Last 20 Lines

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

	//===- AMDGPUDialect.cpp - MLIR AMDGPU dialect implementation --------===//			//===- AMDGPUDialect.cpp - MLIR AMDGPU dialect implementation --------===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// This file implements the AMDGPU dialect and its operations.			// This file implements the AMDGPU dialect and its operations.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"			#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"

				#include "mlir/Dialect/Arith/IR/Arith.h"
	#include "mlir/IR/Builders.h"			#include "mlir/IR/Builders.h"
	#include "mlir/IR/BuiltinTypes.h"			#include "mlir/IR/BuiltinTypes.h"
	#include "mlir/IR/Diagnostics.h"			#include "mlir/IR/Diagnostics.h"
	#include "mlir/IR/DialectImplementation.h"			#include "mlir/IR/DialectImplementation.h"
				#include "mlir/IR/Matchers.h"
	#include "mlir/IR/OpImplementation.h"			#include "mlir/IR/OpImplementation.h"
				#include "mlir/IR/PatternMatch.h"
	#include "mlir/IR/TypeUtilities.h"			#include "mlir/IR/TypeUtilities.h"
	#include "llvm/ADT/TypeSwitch.h"			#include "llvm/ADT/TypeSwitch.h"

				#include <limits>

	using namespace mlir;			using namespace mlir;
	using namespace mlir::amdgpu;			using namespace mlir::amdgpu;

	#include "mlir/Dialect/AMDGPU/AMDGPUDialect.cpp.inc"			#include "mlir/Dialect/AMDGPU/AMDGPUDialect.cpp.inc"

	void AMDGPUDialect::initialize() {			void AMDGPUDialect::initialize() {
	addOperations<			addOperations<
	#define GET_OP_LIST			#define GET_OP_LIST
	Show All 26 Lines
	LogicalResult RawBufferLoadOp::verify() { return verifyRawBufferOp(*this); }			LogicalResult RawBufferLoadOp::verify() { return verifyRawBufferOp(*this); }

	LogicalResult RawBufferStoreOp::verify() { return verifyRawBufferOp(*this); }			LogicalResult RawBufferStoreOp::verify() { return verifyRawBufferOp(*this); }

	LogicalResult RawBufferAtomicFaddOp::verify() {			LogicalResult RawBufferAtomicFaddOp::verify() {
	return verifyRawBufferOp(*this);			return verifyRawBufferOp(*this);
	}			}

				/// Extracts the value of `v` as an unsigned 32-bit integer.
				///
				/// Yields a value only when `v` has type i32 and matches a constant integer;
				/// in every other case the result is None.
				static Optional<uint32_t> getConstantUint32(Value v) {
				  if (v.getType().isInteger(32)) {
				    APInt constant;
				    if (matchPattern(v, m_ConstantInt(&constant)))
				      return constant.getZExtValue();
				  }
				  return None;
				}

				/// Returns true when the buffer access `op` can be shown, from constant
				/// operands alone, to address an element at or past the end of its memref.
				///
				/// Returns false whenever that cannot be established: bounds checking is
				/// disabled on the op, the memref shape is not static, strides/offset cannot
				/// be computed, the number of indices does not match the number of strides,
				/// or any index or the sgprOffset is not a constant i32.
				///
				/// NOTE(review): a static shape does not obviously imply static strides and
				/// offset; confirm that dynamic sentinel values from getStridesAndOffset
				/// cannot reach the arithmetic below.
				/// NOTE(review): comparing the linear offset against getNumElements()
				/// presumably assumes densely packed elements; verify for strided layouts.
				template <typename OpType>
				static bool staticallyOutOfBounds(OpType op) {
				nirvedhmeshramUnsubmitted Not Done Reply Inline Actions nit: maybe this function can just return a bool so you dont have to use it with `succeeded`? nirvedhmeshram: nit: maybe this function can just return a bool so you dont have to use it with `succeeded`?
				// Only accesses with bounds checking enabled are candidates.
				if (!op.getBoundsCheck())
				return false;
				MemRefType bufferType = op.getMemref().getType();
				if (!bufferType.hasStaticShape())
				return false;
				int64_t offset;
				SmallVector<int64_t> strides;
				if (failed(getStridesAndOffset(bufferType, strides, offset)))
				return false;
				// Start from the memref's own offset plus the op's optional indexOffset.
				int64_t result = offset + op.getIndexOffset().value_or(0);
				// A present sgprOffset must itself be a known constant to continue.
				if (op.getSgprOffset()) {
				Optional<uint32_t> sgprOffset = getConstantUint32(op.getSgprOffset());
				if (!sgprOffset)
				return false;
				result += *sgprOffset;
				}
				if (strides.size() != op.getIndices().size())
				return false;
				// Linearize the indices: sum of stride * index, all indices constant.
				int64_t indexVal = 0;
				for (auto pair : llvm::zip(strides, op.getIndices())) {
				int64_t stride = std::get<0>(pair);
				Value idx = std::get<1>(pair);
				Optional<uint32_t> idxVal = getConstantUint32(idx);
				if (!idxVal)
				return false;
				indexVal += stride * idxVal.value();
				}
				result += indexVal;
				if (result > std::numeric_limits<uint32_t>::max())
				// Overflow means don't drop
				return false;
				// Out of bounds when the linear offset reaches the total element count.
				return result >= bufferType.getNumElements();
				}

				namespace {
				/// Rewrite pattern for RawBufferLoadOp: when `staticallyOutOfBounds` reports
				/// the load as out of bounds, the op is replaced by an arith.constant holding
				/// the zero attribute of the loaded type. Otherwise the pattern fails to match.
				/// NOTE(review): presumably this mirrors what a bounds-checked hardware load
				/// returns for an out-of-bounds address — confirm against the op description.
				struct RemoveStaticallyOobBufferLoads final
				: public OpRewritePattern<RawBufferLoadOp> {
				using OpRewritePattern<RawBufferLoadOp>::OpRewritePattern;

				LogicalResult matchAndRewrite(RawBufferLoadOp op,
				PatternRewriter &rw) const override {
				// Leave the load alone unless it is provably out of bounds.
				if (!staticallyOutOfBounds(op))
				nirvedhmeshramUnsubmitted Not Done Reply Inline Actions Since llvm prefers early exits https://llvm.org/docs/CodingStandards.html#use-early-exits-and-continue-to-simplify-code maybe refactor to if(!staticallyOutOfBounds(op)) return failure(); ... return success(); nirvedhmeshram: Since llvm prefers early exits https://llvm.org/docs/CodingStandards.html#use-early-exits-and…
				return failure();
				// Substitute a zero of the result type for the load's value.
				Type loadType = op.getResult().getType();
				rw.replaceOpWithNewOp<arith::ConstantOp>(op, loadType,
				rw.getZeroAttr(loadType));
				return success();
				}
				};

				/// Rewrite pattern, generic over the op type, that erases an op once
				/// `staticallyOutOfBounds` reports its access as out of bounds. The pattern
				/// fails to match when that cannot be shown.
				template <typename OpType>
				struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> {
				  using OpRewritePattern<OpType>::OpRewritePattern;

				  LogicalResult matchAndRewrite(OpType op,
				                                PatternRewriter &rewriter) const override {
				    const bool provablyOutOfBounds = staticallyOutOfBounds(op);
				    if (!provablyOutOfBounds)
				      return failure();

				    // Nothing is produced by the op, so it can simply be removed.
				    rewriter.eraseOp(op);
				    return success();
				  }
				};
				} // end namespace

				void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
				MLIRContext *context) {
				results.add<RemoveStaticallyOobBufferLoads>(context);
				}

				void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
				MLIRContext *context) {
				results.add<RemoveStaticallyOobBufferWrites<RawBufferStoreOp>>(context);
				}

				void RawBufferAtomicFaddOp::getCanonicalizationPatterns(
				RewritePatternSet &results, MLIRContext *context) {
				results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicFaddOp>>(context);
				}

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// MFMAOp			// MFMAOp
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	LogicalResult MFMAOp::verify() {			LogicalResult MFMAOp::verify() {
	constexpr uint32_t waveSize = 64;			constexpr uint32_t waveSize = 64;
	Builder b(getContext());			Builder b(getContext());

	Type sourceType = getSourceA().getType();			Type sourceType = getSourceA().getType();
	▲ Show 20 Lines • Show All 59 Lines • Show Last 20 Lines

mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt

	add_mlir_dialect_library(MLIRAMDGPUDialect			add_mlir_dialect_library(MLIRAMDGPUDialect
	AMDGPUDialect.cpp			AMDGPUDialect.cpp

	ADDITIONAL_HEADER_DIRS			ADDITIONAL_HEADER_DIRS
	${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU			${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU

	DEPENDS			DEPENDS
	MLIRAMDGPUEnumsGen			MLIRAMDGPUEnumsGen
	MLIRAMDGPUAttributesIncGen			MLIRAMDGPUAttributesIncGen
	MLIRAMDGPUIncGen			MLIRAMDGPUIncGen

	LINK_LIBS PUBLIC			LINK_LIBS PUBLIC
				MLIRArithDialect
	MLIRIR			MLIRIR
	MLIRSideEffectInterfaces			MLIRSideEffectInterfaces
	)			)

mlir/test/Dialect/AMDGPU/canonicalize.mlir

This file was added.

				// RUN: mlir-opt %s -split-input-file -canonicalize | FileCheck %s

				// Constant index 4 into a 4-element buffer with bounds checking enabled:
				// the load is expected to be replaced by a zero constant.
				// CHECK-LABEL: func @known_oob_load
				func.func @known_oob_load(%arg0: memref<4xf32>) -> f32 {
				// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
				// CHECK: return %[[zero]]
				%c4_i32 = arith.constant 4 : i32
				%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
				func.return %0 : f32
				}

				// -----

				// Two-dimensional case: constant indices [4, 0] on a 4x4 buffer. The load is
				// expected to be replaced by a zero constant (presumably because the
				// linearized offset reaches the 16-element total — confirm stride order).
				// CHECK-LABEL: func @known_oob_load_2d
				func.func @known_oob_load_2d(%arg0: memref<4x4xf32>) -> f32 {
				// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
				// CHECK: return %[[zero]]
				%c0_i32 = arith.constant 0 : i32
				%c4_i32 = arith.constant 4 : i32
				%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32, %c0_i32] : memref<4x4xf32>, i32, i32 -> f32
				func.return %0 : f32
				}

				// -----

				// Constant indices [0, 16] on a 4x4 buffer: the out-of-range value sits on
				// the last index, and the load is expected to be replaced by a zero constant.
				// CHECK-LABEL: func @known_oob_load_2d_on_last
				func.func @known_oob_load_2d_on_last(%arg0: memref<4x4xf32>) -> f32 {
				// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
				// CHECK: return %[[zero]]
				%c0_i32 = arith.constant 0 : i32
				%c16_i32 = arith.constant 16 : i32
				%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c16_i32] : memref<4x4xf32>, i32, i32 -> f32
				func.return %0 : f32
				}

				// -----

				// The index operand is 0 but the `indexOffset` attribute is 4 on a 4-element
				// buffer: the load is expected to be replaced by a zero constant.
				// CHECK-LABEL: func @known_oob_load_index
				func.func @known_oob_load_index(%arg0: memref<4xf32>) -> f32 {
				// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
				// CHECK: return %[[zero]]
				%c0_i32 = arith.constant 0 : i32
				%0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 4 : i32} %arg0[%c0_i32] : memref<4xf32>, i32 -> f32
				func.return %0 : f32
				}

				// -----

				// Constant index 2 plus constant sgprOffset 2 on a 4-element buffer: the
				// load is expected to be replaced by a zero constant.
				// CHECK-LABEL: func @known_oob_load_sgproffset
				func.func @known_oob_load_sgproffset(%arg0: memref<4xf32>) -> f32 {
				// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
				// CHECK: return %[[zero]]
				%c2_i32 = arith.constant 2 : i32
				%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c2_i32] sgprOffset %c2_i32 : memref<4xf32>, i32 -> f32
				func.return %0 : f32
				}

				// -----

				// The index is a function argument rather than a constant, so the load is
				// expected to be kept even though the constant sgprOffset is 4.
				// CHECK-LABEL: func @unknown_load
				func.func @unknown_load(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
				// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
				// CHECK: return %[[loaded]]
				%c4_i32 = arith.constant 4 : i32
				%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%arg1] sgprOffset %c4_i32 : memref<4xf32>, i32 -> f32
				func.return %0 : f32
				}

				// -----

				// The sgprOffset is a function argument rather than a constant, so the load
				// is expected to be kept even though the constant index is 4.
				// CHECK-LABEL: func @unknown_load_sgproffset
				func.func @unknown_load_sgproffset(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
				// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
				// CHECK: return %[[loaded]]
				%c4_i32 = arith.constant 4 : i32
				%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] sgprOffset %arg1 : memref<4xf32>, i32 -> f32
				func.return %0 : f32
				}

				// -----

				// Note: despite the function name, `memref<?xf32>` is a ranked memref with a
				// dynamic dimension. With no static size available, the load is expected to
				// be kept.
				// CHECK-LABEL: func @unranked
				func.func @unranked(%arg0: memref<?xf32>) -> f32 {
				// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
				// CHECK: return %[[loaded]]
				%c4_i32 = arith.constant 4 : i32
				%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<?xf32>, i32 -> f32
				func.return %0 : f32
				}

				// -----

				// Same access as the known out-of-bounds load, but with boundsCheck = false:
				// the load is expected to be kept.
				// CHECK-LABEL: func @no_oob_check
				func.func @no_oob_check(%arg0: memref<4xf32>) -> f32 {
				// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
				// CHECK: return %[[loaded]]
				%c4_i32 = arith.constant 4 : i32
				%0 = amdgpu.raw_buffer_load {boundsCheck = false} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
				func.return %0 : f32
				}

				// -----

				// Constant indices [0, 15] on a 4x4 buffer: 15 exceeds the extent of the
				// last dimension, yet the load is expected to be kept (presumably because
				// only the overall linearized offset is compared against the element count).
				// CHECK-LABEL: func @in_bounds_overall
				func.func @in_bounds_overall(%arg0: memref<4x4xf32>) -> f32 {
				// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
				// CHECK: return %[[loaded]]
				%c0_i32 = arith.constant 0 : i32
				%c15_i32 = arith.constant 15 : i32
				%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c15_i32] : memref<4x4xf32>, i32, i32 -> f32
				func.return %0 : f32
				}

				// -----

				// A bounds-checked store at constant index 4 into a 4-element buffer is
				// expected to be removed entirely.
				// CHECK-LABEL: func @dead_store
				func.func @dead_store(%arg0: memref<4xf32>, %arg1: f32) {
				// CHECK-NOT: amdgpu.raw_buffer_store
				%c4_i32 = arith.constant 4 : i32
				amdgpu.raw_buffer_store {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
				func.return
				}

				// -----

				// A bounds-checked atomic fadd at constant index 4 into a 4-element buffer
				// is expected to be removed entirely.
				// CHECK-LABEL: func @dead_atomic_add
				func.func @dead_atomic_add(%arg0: memref<4xf32>, %arg1: f32) {
				// CHECK-NOT: amdgpu.raw_buffer_atomic_fadd
				%c4_i32 = arith.constant 4 : i32
				amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
				func.return
				}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][AMDGPU] Remove buffer ops that are statically out of bounds
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 476606

mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt

mlir/test/Dialect/AMDGPU/canonicalize.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][AMDGPU] Remove buffer ops that are statically out of bounds
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 476606

mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt

mlir/test/Dialect/AMDGPU/canonicalize.mlir

[mlir][AMDGPU] Remove buffer ops that are statically out of bounds
ClosedPublic