This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
mlir/
-
lib/Dialect/Vector/Transforms/
-
Dialect/
-
Vector/
-
Transforms/
-
VectorDistribute.cpp
-
test/Dialect/Vector/
-
Dialect/
-
Vector/
1
vector-warp-distribute.mlir

Differential D154870

[mlir] Distribute single-element shape cast in PropagateWarpVectorDistributionPatterns
Needs ReviewPublic

Authored by pzread on Jul 10 2023, 10:58 AM.

Download Raw Diff

Details

Reviewers

aartbik
antiagainst
nicolasvasilache
dcaballe
ThomasRaoux

Summary

Vector shape casts between single element vectors (vector<1xf32>) and zero dim vectors (vector<f32>) are created when dropping unit dims on transfer read/write.

For example:

%13 = vector.transfer_read %subview_14[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%14 = vector.shape_cast %13 : vector<i32> to vector<1xi32>
%15 = arith.muli %8, %cst_1 : vector<1xi32>
%16 = arith.subi %12, %15 : vector<1xi32>
%17 = arith.addi %10, %16 : vector<1xi32>
%18 = "tosa.apply_scale"(%17, %14, %cst_2) <{double_round = true}> : (vector<1xi32>, vector<1xi32>, vector<1xi8>) -> vector<1xi32>
%19 = arith.addi %18, %cst_3 : vector<1xi32>
%20 = arith.cmpi slt, %19, %cst_1 : vector<1xi32>
%21 = arith.select %20, %cst_1, %19 : vector<1xi1>, vector<1xi32>
%22 = arith.cmpi sgt, %19, %cst_4 : vector<1xi32>
%23 = arith.select %22, %cst_4, %21 : vector<1xi1>, vector<1xi32>
%24 = arith.trunci %23 : vector<1xi32> to vector<1xi8>
%25 = arith.sitofp %24 : vector<1xi8> to vector<1xf32>
%26 = arith.subf %25, %cst_5 : vector<1xf32>
%27 = arith.mulf %26, %cst_6 : vector<1xf32>
%subview_15 = memref.subview %subview[0] [1] [1] : memref<1xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<f32, strided<[], offset: ?>, #hal.descriptor_type<storage_buffer>>
%28 = vector.shape_cast %27 : vector<1xf32> to vector<f32>
vector.transfer_write %28, %subview_15[] : vector<f32>, memref<f32, strided<[], offset: ?>, #hal.descriptor_type<storage_buffer>>

Add a pattern to distribute those trivial shape casts.

Multi-element shape casts can also happen when dropping unit dims (e.g. vector<8x1> to vector<8>). Those can be handled later when we have an example.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

pzread created this revision.Jul 10 2023, 10:58 AM

Herald added a reviewer: aartbik. · View Herald TranscriptJul 10 2023, 10:58 AM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: bviyer, Moerafaat, zero9178 and 24 others. · View Herald Transcript

pzread edited the summary of this revision. (Show Details)Jul 10 2023, 11:05 AM

pzread added a reviewer: antiagainst.

Herald added a subscriber: eric-k256. · View Herald TranscriptJul 10 2023, 11:05 AM

pzread published this revision for review.Jul 10 2023, 11:05 AM

pzread edited the summary of this revision. (Show Details)

Herald added a reviewer: nicolasvasilache. · View Herald TranscriptJul 10 2023, 11:07 AM

Herald added a reviewer: dcaballe. · View Herald Transcript

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: stephenneuendorffer, nicolasvasilache. · View Herald Transcript

pzread edited the summary of this revision. (Show Details)Jul 10 2023, 11:08 AM

dcaballe added a reviewer: ThomasRaoux.Jul 11 2023, 10:27 AM

Kindly ping : )

antiagainst added inline comments.Aug 17 2023, 8:40 AM

mlir/test/Dialect/Vector/vector-warp-distribute.mlir
1188	Actually this is just broadcast? Have you considered adding a canonicalization pattern to turn this into broadcast? Then we can leverage the existing `WarpOnBroadcast` pattern. It could be more widely applicable and help to clean other places too.

Herald added a subscriber: sunshaoce. · View Herald TranscriptAug 17 2023, 8:40 AM

Revision Contents

Path

Size

mlir/

lib/

Dialect/

Vector/

Transforms/

VectorDistribute.cpp

46 lines

test/

Dialect/

Vector/

vector-warp-distribute.mlir

39 lines

Diff 538739

mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp

Show First 20 Lines • Show All 877 Lines • ▼ Show 20 Lines	LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
}		}
if (!valForwarded)		if (!valForwarded)
return failure();		return failure();
rewriter.replaceAllUsesWith(warpOp.getResult(resultIndex), valForwarded);		rewriter.replaceAllUsesWith(warpOp.getResult(resultIndex), valForwarded);
return success();		return success();
}		}
};		};

		/// Propagation pattern that moves a single-element `vector.shape_cast` out of
		/// a `WarpExecuteOnLane0Op` region.
		///
		/// The pattern applies when a value yielded by the warp op is defined by a
		/// `vector::ShapeCastOp` and both the shape cast source type and the
		/// corresponding warp op result type hold exactly one element. In that case
		/// the shape cast source is returned from a new warp op and the shape cast
		/// is re-created after it, directly producing the original result type.
		/// Every other shape cast is left untouched (the match fails).
		struct WarpOpShapeCast : public OpRewritePattern<WarpExecuteOnLane0Op> {
		  using OpRewritePattern<WarpExecuteOnLane0Op>::OpRewritePattern;
		  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
		                                PatternRewriter &rewriter) const override {
		    // Look for a warp result whose yielded value comes from a shape cast.
		    // NOTE(review): exact selection rules live in getWarpResult, which is
		    // not visible here.
		    OpOperand *operand = getWarpResult(
		        warpOp, [](Operation *op) { return isa<vector::ShapeCastOp>(op); });
		    if (!operand)
		      return failure();
		    // The yield operand index is also used as the index of the matching
		    // warp op result.
		    unsigned int operandNumber = operand->getOperandNumber();
		    auto destVecType =
		        cast<VectorType>(warpOp->getResultTypes()[operandNumber]);
		    auto shapeCast = operand->get().getDefiningOp<vector::ShapeCastOp>();
		    Location loc = shapeCast.getLoc();
		    Value shapeCastSrc = shapeCast.getSource();
		    auto shapeCastSrcType = cast<VectorType>(shapeCastSrc.getType());
		    // Only handle the trivial shape cast with a single element for now.
		    // All checks run before any IR is modified, so bailing out here leaves
		    // the input untouched.
		    // TODO: Support more cases.
		    if (shapeCastSrcType.getNumElements() != 1 ||
		        destVecType.getNumElements() != 1)
		      return failure();
		    // For the single element shape cast, the source is broadcasted to all
		    // lanes, and each lane casts the source into the target shape. This is
		    // always possible because it's 1-element-to-1-element casting.
		    SmallVector<size_t> newRetIndices;
		    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
		        rewriter, warpOp, {shapeCastSrc}, {shapeCastSrcType}, newRetIndices);
		    // Re-create the cast right after the new warp op, fed by the appended
		    // result that carries the shape cast source.
		    rewriter.setInsertionPointAfter(newWarpOp);
		    Value newShapeCast = rewriter.create<vector::ShapeCastOp>(
		        loc, destVecType, newWarpOp->getResult(newRetIndices[0]));
		    // Redirect users of the original result to the cast outside the region.
		    rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber),
		                                newShapeCast);
		    return success();
		  }
		};

struct WarpOpBroadcast : public OpRewritePattern<WarpExecuteOnLane0Op> {		struct WarpOpBroadcast : public OpRewritePattern<WarpExecuteOnLane0Op> {
using OpRewritePattern<WarpExecuteOnLane0Op>::OpRewritePattern;		using OpRewritePattern<WarpExecuteOnLane0Op>::OpRewritePattern;
LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,		LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
OpOperand *operand = getWarpResult(		OpOperand *operand = getWarpResult(
warpOp, [](Operation *op) { return isa<vector::BroadcastOp>(op); });		warpOp, [](Operation *op) { return isa<vector::BroadcastOp>(op); });
if (!operand)		if (!operand)
return failure();		return failure();
▲ Show 20 Lines • Show All 660 Lines • ▼ Show 20 Lines	void mlir::vector::populateDistributeTransferWriteOpPatterns(
PatternBenefit benefit) {		PatternBenefit benefit) {
patterns.add<WarpOpTransferWrite>(patterns.getContext(), distributionMapFn,		patterns.add<WarpOpTransferWrite>(patterns.getContext(), distributionMapFn,
benefit);		benefit);
}		}

void mlir::vector::populatePropagateWarpVectorDistributionPatterns(		void mlir::vector::populatePropagateWarpVectorDistributionPatterns(
RewritePatternSet &patterns, const DistributionMapFn &distributionMapFn,		RewritePatternSet &patterns, const DistributionMapFn &distributionMapFn,
const WarpShuffleFromIdxFn &warpShuffleFromIdxFn, PatternBenefit benefit) {		const WarpShuffleFromIdxFn &warpShuffleFromIdxFn, PatternBenefit benefit) {
patterns.add<WarpOpElementwise, WarpOpTransferRead, WarpOpDeadResult,		patterns
WarpOpBroadcast, WarpOpExtract, WarpOpForwardOperand,		.add<WarpOpElementwise, WarpOpTransferRead, WarpOpDeadResult,
WarpOpConstant, WarpOpInsertElement, WarpOpInsert>(		WarpOpBroadcast, WarpOpExtract, WarpOpForwardOperand, WarpOpConstant,
		WarpOpInsertElement, WarpOpInsert, WarpOpShapeCast>(
patterns.getContext(), benefit);		patterns.getContext(), benefit);
patterns.add<WarpOpExtractElement>(patterns.getContext(),		patterns.add<WarpOpExtractElement>(patterns.getContext(),
warpShuffleFromIdxFn, benefit);		warpShuffleFromIdxFn, benefit);
patterns.add<WarpOpScfForOp>(patterns.getContext(), distributionMapFn,		patterns.add<WarpOpScfForOp>(patterns.getContext(), distributionMapFn,
benefit);		benefit);
}		}

void mlir::vector::populateDistributeReduction(		void mlir::vector::populateDistributeReduction(
RewritePatternSet &patterns,		RewritePatternSet &patterns,
Show All 33 Lines

mlir/test/Dialect/Vector/vector-warp-distribute.mlir

Show First 20 Lines • Show All 1,167 Lines • ▼ Show 20 Lines	func.func @dont_fold_vector_broadcast(%laneid: index) {
%r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2xf32>) {		%r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2xf32>) {
%0 = "some_def"() : () -> (vector<64xf32>)		%0 = "some_def"() : () -> (vector<64xf32>)
%1 = vector.broadcast %0 : vector<64xf32> to vector<1x64xf32>		%1 = vector.broadcast %0 : vector<64xf32> to vector<1x64xf32>
vector.yield %1 : vector<1x64xf32>		vector.yield %1 : vector<1x64xf32>
}		}
vector.print %r : vector<1x2xf32>		vector.print %r : vector<1x2xf32>
return		return
}		}

		// -----

		// CHECK-PROP-LABEL: func @distribute_single_element_shape_cast(
		// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<f32>)
		// CHECK-PROP: %[[some_def:.*]] = "some_def"
		// CHECK-PROP: vector.yield %[[some_def]] : vector<f32>
		// CHECK-PROP: %[[s:.*]] = vector.shape_cast %[[r]] : vector<f32> to vector<1x1xf32>
		// CHECK-PROP: vector.print %[[s]] : vector<1x1xf32>
		func.func @distribute_single_element_shape_cast(%laneid: index) {
		%r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x1xf32>) {
		%0 = "some_def"() : () -> (vector<f32>)
		%1 = vector.shape_cast %0 : vector<f32> to vector<1x1xf32>
		// [Inline review comment by antiagainst (unsubmitted, not done)]: Actually this is just broadcast? Have you considered adding a canonicalization pattern to turn this into broadcast? Then we can leverage the existing `WarpOnBroadcast` pattern. It could be more widely applicable and help to clean other places too.
		vector.yield %1 : vector<1x1xf32>
		}
		vector.print %r : vector<1x1xf32>
		return
		}

		// -----

		// TODO: Distribute non-trivial shape cast when possible.

		// CHECK-PROP-LABEL: func @dont_distribute_nontrivial_shape_cast(
		// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1x2xf32>)
		// CHECK-PROP: %[[some_def:.*]] = "some_def"
		// CHECK-PROP: %[[s:.*]] = vector.shape_cast %[[some_def]] : vector<2x32xf32> to vector<32x2xf32>
		// CHECK-PROP: vector.yield %[[s]] : vector<32x2xf32>
		// CHECK-PROP: vector.print %[[r]] : vector<1x2xf32>
		// Negative test: the shape cast below reshapes a 64-element vector
		// (2x32 -> 32x2), so it is not a single-element cast and is expected to
		// stay inside the warp region, as the CHECK-PROP lines for this function
		// require.
		func.func @dont_distribute_nontrivial_shape_cast(%laneid: index) {
		// 32 lanes; the warp op result type is vector<1x2xf32> while the region
		// yields the full vector<32x2xf32>.
		%r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2xf32>) {
		// Opaque producer of the value being reshaped.
		%0 = "some_def"() : () -> (vector<2x32xf32>)
		%1 = vector.shape_cast %0 : vector<2x32xf32> to vector<32x2xf32>
		vector.yield %1 : vector<32x2xf32>
		}
		// Use of the warp op result outside the region.
		vector.print %r : vector<1x2xf32>
		return
		}