Diff 513352

mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp

//===- FoldMemRefAliasOps.cpp - Fold memref alias ops -----===//		//===- FoldMemRefAliasOps.cpp - Fold memref alias ops -----===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This transformation pass folds loading/storing from/to subview ops into		// This transformation pass folds loading/storing from/to subview ops into
// loading/storing from/to the original memref.		// loading/storing from/to the original memref.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

ThomasRaouxUnsubmitted Not Done Reply Inline Actions nit: I would leave this empty line ThomasRaoux: nit: I would leave this empty line
#include "mlir/Dialect/Affine/IR/AffineOps.h"		#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"		#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"		#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"		#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"		#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"		#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"		#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"		#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
		#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"		#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"		#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineMap.h"		#include "mlir/IR/AffineMap.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"		#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallBitVector.h"		#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/TypeSwitch.h"		#include "llvm/ADT/TypeSwitch.h"
		#include "llvm/Support/Debug.h"

		#define DEBUG_TYPE "fold-memref-alias-ops"
		#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

namespace mlir {		namespace mlir {
namespace memref {		namespace memref {
#define GEN_PASS_DEF_FOLDMEMREFALIASOPS		#define GEN_PASS_DEF_FOLDMEMREFALIASOPS
#include "mlir/Dialect/MemRef/Transforms/Passes.h.inc"		#include "mlir/Dialect/MemRef/Transforms/Passes.h.inc"
} // namespace memref		} // namespace memref
} // namespace mlir		} // namespace mlir

▲ Show 20 Lines • Show All 251 Lines • ▼ Show 20 Lines	LogicalResult matchAndRewrite(memref::SubViewOp subView,

// Replace original op.		// Replace original op.
rewriter.replaceOpWithNewOp<memref::SubViewOp>(		rewriter.replaceOpWithNewOp<memref::SubViewOp>(
subView, subView.getType(), srcSubView.getSource(), newOffsets,		subView, subView.getType(), srcSubView.getSource(), newOffsets,
newSizes, srcSubView.getMixedStrides());		newSizes, srcSubView.getMixedStrides());
return success();		return success();
}		}
};		};

		/// Folds memref.subview ops feeding an nvgpu.device_async_copy into the copy
		/// itself. The pattern handles a subview on the source memref, on the
		/// destination memref, or on both.
		class NvgpuAsyncCopyOpSubViewOpFolder final
		: public OpRewritePattern<nvgpu::DeviceAsyncCopyOp> {
		public:
		using OpRewritePattern<nvgpu::DeviceAsyncCopyOp>::OpRewritePattern;

		// Defined out of line, after the helper functions it relies on.
		LogicalResult matchAndRewrite(nvgpu::DeviceAsyncCopyOp copyOp,
		PatternRewriter &rewriter) const override;
		};
} // namespace		} // namespace

static SmallVector<Value>		static SmallVector<Value>
calculateExpandedAccessIndices(AffineMap affineMap,		calculateExpandedAccessIndices(AffineMap affineMap,
const SmallVector<Value> &indices, Location loc,		const SmallVector<Value> &indices, Location loc,
PatternRewriter &rewriter) {		PatternRewriter &rewriter) {
SmallVector<OpFoldResult> indicesOfr(llvm::to_vector(		SmallVector<OpFoldResult> indicesOfr(llvm::to_vector(
llvm::map_range(indices, [](Value v) -> OpFoldResult { return v; })));		llvm::map_range(indices, [](Value v) -> OpFoldResult { return v; })));
▲ Show 20 Lines • Show All 281 Lines • ▼ Show 20 Lines	llvm::TypeSwitch<Operation *, void>(storeOp)
rewriter.replaceOpWithNewOp<decltype(op)>(		rewriter.replaceOpWithNewOp<decltype(op)>(
storeOp, storeOp.getValue(), collapseShapeOp.getViewSource(),		storeOp, storeOp.getValue(), collapseShapeOp.getViewSource(),
sourceIndices);		sourceIndices);
})		})
.Default([](Operation *) { llvm_unreachable("unexpected operation."); });		.Default([](Operation *) { llvm_unreachable("unexpected operation."); });
return success();		return success();
}		}

		/// Rewrites an nvgpu.device_async_copy whose source and/or destination is
		/// defined by a memref.subview so that it operates on the subview's source
		/// memref instead, with the access indices resolved through the subview's
		/// offsets, strides and dropped dimensions.
		///
		/// Fails to match when neither operand is defined by a memref.subview.
		/// An operand that is not a subview is forwarded with its indices unchanged.
		LogicalResult NvgpuAsyncCopyOpSubViewOpFolder::matchAndRewrite(
		    nvgpu::DeviceAsyncCopyOp copyOp, PatternRewriter &rewriter) const {
		  LLVM_DEBUG(DBGS() << "copyOp : " << copyOp << "\n");

		  Location loc = copyOp.getLoc();
		  auto srcSubViewOp = copyOp.getSrc().getDefiningOp<memref::SubViewOp>();
		  auto dstSubViewOp = copyOp.getDst().getDefiningOp<memref::SubViewOp>();

		  if (!srcSubViewOp && !dstSubViewOp)
		    return rewriter.notifyMatchFailure(copyOp, "does not use subview ops for "
		                                               "source or destination");

		  // Start from the original source indices; they are only rewritten when the
		  // source is a subview.
		  SmallVector<Value> srcindices(copyOp.getSrcIndices().begin(),
		                                copyOp.getSrcIndices().end());
		  SmallVector<Value> foldedSrcIndices(srcindices);

		  if (srcSubViewOp) {
		    LLVM_DEBUG(DBGS() << "srcSubViewOp : " << srcSubViewOp << "\n");
		    resolveSourceIndicesOffsetsAndStrides(
		        rewriter, loc, srcSubViewOp.getMixedOffsets(),
		        srcSubViewOp.getMixedStrides(), srcSubViewOp.getDroppedDims(),
		        srcindices, foldedSrcIndices);
		  }

		  // Same treatment for the destination indices.
		  SmallVector<Value> dstindices(copyOp.getDstIndices().begin(),
		                                copyOp.getDstIndices().end());
		  SmallVector<Value> foldedDstIndices(dstindices);

		  if (dstSubViewOp) {
		    LLVM_DEBUG(DBGS() << "dstSubViewOp : " << dstSubViewOp << "\n");
		    resolveSourceIndicesOffsetsAndStrides(
		        rewriter, loc, dstSubViewOp.getMixedOffsets(),
		        dstSubViewOp.getMixedStrides(), dstSubViewOp.getDroppedDims(),
		        dstindices, foldedDstIndices);
		  }

		  // Replace the copy op with a new copy op that addresses the memrefs
		  // underlying the subviews; non-subview operands are passed through as-is.
		  Value newDst = dstSubViewOp ? dstSubViewOp.getSource() : copyOp.getDst();
		  Value newSrc = srcSubViewOp ? srcSubViewOp.getSource() : copyOp.getSrc();
		  rewriter.replaceOpWithNewOp<nvgpu::DeviceAsyncCopyOp>(
		      copyOp, nvgpu::DeviceAsyncTokenType::get(copyOp.getContext()), newDst,
		      foldedDstIndices, newSrc, foldedSrcIndices, copyOp.getDstElements(),
		      copyOp.getSrcElements(), copyOp.getBypassL1Attr());

		  return success();
		}

void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {		void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
patterns.add<LoadOpOfSubViewOpFolder<AffineLoadOp>,		patterns.add<LoadOpOfSubViewOpFolder<AffineLoadOp>,
LoadOpOfSubViewOpFolder<memref::LoadOp>,		LoadOpOfSubViewOpFolder<memref::LoadOp>,
LoadOpOfSubViewOpFolder<vector::TransferReadOp>,		LoadOpOfSubViewOpFolder<vector::TransferReadOp>,
LoadOpOfSubViewOpFolder<gpu::SubgroupMmaLoadMatrixOp>,		LoadOpOfSubViewOpFolder<gpu::SubgroupMmaLoadMatrixOp>,
StoreOpOfSubViewOpFolder<AffineStoreOp>,		StoreOpOfSubViewOpFolder<AffineStoreOp>,
StoreOpOfSubViewOpFolder<memref::StoreOp>,		StoreOpOfSubViewOpFolder<memref::StoreOp>,
StoreOpOfSubViewOpFolder<vector::TransferWriteOp>,		StoreOpOfSubViewOpFolder<vector::TransferWriteOp>,
StoreOpOfSubViewOpFolder<gpu::SubgroupMmaStoreMatrixOp>,		StoreOpOfSubViewOpFolder<gpu::SubgroupMmaStoreMatrixOp>,
LoadOpOfExpandShapeOpFolder<AffineLoadOp>,		LoadOpOfExpandShapeOpFolder<AffineLoadOp>,
LoadOpOfExpandShapeOpFolder<memref::LoadOp>,		LoadOpOfExpandShapeOpFolder<memref::LoadOp>,
StoreOpOfExpandShapeOpFolder<AffineStoreOp>,		StoreOpOfExpandShapeOpFolder<AffineStoreOp>,
StoreOpOfExpandShapeOpFolder<memref::StoreOp>,		StoreOpOfExpandShapeOpFolder<memref::StoreOp>,
LoadOpOfCollapseShapeOpFolder<AffineLoadOp>,		LoadOpOfCollapseShapeOpFolder<AffineLoadOp>,
LoadOpOfCollapseShapeOpFolder<memref::LoadOp>,		LoadOpOfCollapseShapeOpFolder<memref::LoadOp>,
StoreOpOfCollapseShapeOpFolder<AffineStoreOp>,		StoreOpOfCollapseShapeOpFolder<AffineStoreOp>,
StoreOpOfCollapseShapeOpFolder<memref::StoreOp>,		StoreOpOfCollapseShapeOpFolder<memref::StoreOp>,
SubViewOfSubViewFolder>(patterns.getContext());		SubViewOfSubViewFolder, NvgpuAsyncCopyOpSubViewOpFolder>(
		patterns.getContext());
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Pass registration		// Pass registration
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

namespace {		namespace {

Show All 16 Lines

mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir

	Show First 20 Lines • Show All 535 Lines • ▼ Show 20 Lines
	// CHECK-LABEL: func.func @fold_gpu_subgroup_mma_load_matrix_2d			// CHECK-LABEL: func.func @fold_gpu_subgroup_mma_load_matrix_2d
	// CHECK-SAME: %[[DST:.+]]: memref<128x128xf32>			// CHECK-SAME: %[[DST:.+]]: memref<128x128xf32>
	func.func @fold_gpu_subgroup_mma_load_matrix_2d(%arg0 : memref<128x128xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %matrix: !gpu.mma_matrix<16x16xf16, "COp">) {			func.func @fold_gpu_subgroup_mma_load_matrix_2d(%arg0 : memref<128x128xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %matrix: !gpu.mma_matrix<16x16xf16, "COp">) {
	%subview = memref.subview %arg0[%arg1, %arg2][64, 32][2, 1] : memref<128x128xf32> to memref<64x32xf32, strided<[64, 1], offset: ?>>			%subview = memref.subview %arg0[%arg1, %arg2][64, 32][2, 1] : memref<128x128xf32> to memref<64x32xf32, strided<[64, 1], offset: ?>>
	// CHECK: gpu.subgroup_mma_store_matrix %{{.+}}, %[[DST]][{{.+}}] {leadDimension = 32 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<128x128xf32>			// CHECK: gpu.subgroup_mma_store_matrix %{{.+}}, %[[DST]][{{.+}}] {leadDimension = 32 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<128x128xf32>
	gpu.subgroup_mma_store_matrix %matrix, %subview[%arg3, %arg4] {leadDimension = 32 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<64x32xf32, strided<[64, 1], offset: ?>>			gpu.subgroup_mma_store_matrix %matrix, %subview[%arg3, %arg4] {leadDimension = 32 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<64x32xf32, strided<[64, 1], offset: ?>>
	return			return
	}			}

				// -----


				// Test input: the copy source is a rank-reducing subview (3-D to 2-D) taken
				// at dynamic offsets and read at constant-zero indices; the copy destination
				// is the memref.alloc result itself, not a subview.
				func.func @fold_nvgpu_device_async_copy_zero_sub_idx(%gmem_memref_3d : memref<2x128x768xf16>, %idx_1 : index, %idx_2 : index, %idx_3 : index) {

				%c0 = arith.constant 0 : index
				%smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
				%gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%idx_1, %idx_2, %idx_3] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
				%async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%c0, %c0], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
				return
				}

				ThomasRaouxUnsubmitted Done Reply Inline Actions would be good to test cases where the index is not 0 ThomasRaoux: would be good to test cases where the index is not 0
				manishucsdAuthorUnsubmitted Done Reply Inline Actions The test itself is not anchored on c0. After the subview, the read will be on `%gmem_memref_subview_2d[%c0, %c0]` for all the reads, and %c0 is defined in the original IR for use there. The check is that the subview indices are used in the `nvgpu.device_async_copy` after the folding, `%[[GMEM_MEMREF_3d]][%[[IDX_1]], %[[IDX_2]], %[[IDX_3]]]`, i.e., IDX_1, IDX_2, IDX_3. `%smem_memref_4d` is not folded so we can ignore that for now. manishucsd: The test itself is not anchored on c0. After the subview, the read will be on…
				ThomasRaouxUnsubmitted Done Reply Inline Actions If the original indices are not zero they need to be combined with the subview indices right? Don’t we miss testing this part? Also that means the folding of the dst subview is not tested? ThomasRaoux: If the original indices are not zero they need to be combined with the subview indices right?
				// CHECK-LABEL: func.func @fold_nvgpu_device_async_copy_zero_sub_idx
				// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index, %[[IDX_3:.+]]: index)
				// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
				// CHECK-DAG: %[[SMEM_MEMREF_4d:.+]] = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
				// CHECK: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[IDX_1]], %[[IDX_2]], %[[IDX_3]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>

				// -----


				// Test input: the copy source is a rank-reducing subview (3-D to 2-D) taken
				// at dynamic offsets and read at dynamic indices, so the subview offsets and
				// the access indices both contribute to the final address; the copy
				// destination is the memref.alloc result itself, not a subview.
				func.func @fold_src_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index) {
				%c0 = arith.constant 0 : index
				%smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
				%gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
				%async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
				return
				}

				// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
				// CHECK: func.func @fold_src_nvgpu_device_async_copy
				// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index)
				// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
				// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
				// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
				// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>

				// -----


				// Test input: both the copy source (3-D to 2-D) and the copy destination
				// (4-D to 2-D) are rank-reducing subviews taken at dynamic offsets, and each
				// is accessed at dynamic indices.
				func.func @fold_src_fold_dest_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index, %dest_idx_0 : index, %dest_idx_1 : index, %dest_idx_2 : index, %dest_idx_3 : index, %dest_sub_idx_0 : index, %dest_sub_idx_1 : index) {
				%c0 = arith.constant 0 : index
				%smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
				%gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
				%smem_memref_2d = memref.subview %smem_memref_4d[%dest_idx_0, %dest_idx_1, %dest_idx_2, %dest_idx_3] [1, 1, 1, 8] [1, 1, 1, 1] : memref<5x1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
				%async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_2d[%dest_sub_idx_0, %dest_sub_idx_1], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
				return
				}

				// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
				// CHECK: func.func @fold_src_fold_dest_nvgpu_device_async_copy
				// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index, %[[DEST_IDX_0:.+]]: index, %[[DEST_IDX_1:.+]]: index, %[[DEST_IDX_2:.+]]: index, %[[DEST_IDX_3:.+]]: index, %[[DEST_SUB_IDX_0:.+]]: index, %[[DEST_SUB_IDX_1:.+]]: index)
				// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
				// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
				// CHECK-DAG: %[[RESOLVED_DST_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_1]], %[[DEST_SUB_IDX_0]]]
				// CHECK-DAG: %[[RESOLVED_DST_IDX_3:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_3]], %[[DEST_SUB_IDX_1]]]
				// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[DEST_IDX_0]], %[[RESOLVED_DST_IDX_1]], %[[DEST_IDX_2]], %[[RESOLVED_DST_IDX_3]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Memref] Fold nvgpu.device_async_copy on src memref- to dst memref-subviews
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 513352

mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp

mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Memref] Fold nvgpu.device_async_copy on src memref- to dst memref-subviews ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 513352

mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp

mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir

[mlir][Memref] Fold nvgpu.device_async_copy on src memref- to dst memref-subviews
ClosedPublic