This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
mlir/
-
lib/Dialect/Vector/Transforms/
-
Dialect/
-
Vector/
-
Transforms/
4/5
VectorDistribute.cpp
-
test/Dialect/Vector/
-
Dialect/
-
Vector/
-
vector-warp-distribute.mlir

Differential D133826

[mlir][vector] Clean up and generalize lowering of warp_execute to scf
ClosedPublic

Authored by ThomasRaoux on Sep 13 2022, 7:14 PM.

Download Raw Diff

Details

Reviewers

nicolasvasilache
aartbik
dcaballe

Commits

rG4abb9e5d2054: [mlir][vector] Clean up and generalize lowering of warp_execute to scf

Summary

Simplify the lowering of warp_execute_on_lane_0 to scf.if by making the
logic more generic. Also remove the assumption that the innermost
dimension is the distributed dimension.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

ThomasRaoux created this revision.Sep 13 2022, 7:14 PM

Herald added a reviewer: aartbik. · View Herald TranscriptSep 13 2022, 7:14 PM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: bzcheeseman, sdasgup3, wenzhicui and 18 others. · View Herald Transcript

ThomasRaoux requested review of this revision.Sep 13 2022, 7:14 PM

Herald added a reviewer: dcaballe. · View Herald TranscriptSep 13 2022, 7:14 PM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added a subscriber: stephenneuendorffer. · View Herald Transcript

Harbormaster completed remote builds in B186515: Diff 459957.Sep 13 2022, 7:34 PM

Nice!

Please check no regressions in IREE and add new tests if something flares up.

Thanks!

mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
36	`perm.reserve(...)`
37	There is no yield value here, just use seq and dist types plz.
46	Shouldn't there be some sanity checks here and described in the doc? I.e. things that divide modulo warp size, `size(perm) == rank || size(perm) == rank - 1` etc ?

This revision is now accepted and ready to land.Sep 14 2022, 8:34 AM

nicolasvasilache added inline comments.Sep 14 2022, 8:34 AM

mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
58	nit: inferred

Address review comments

ThomasRaoux marked 3 inline comments as done.Sep 14 2022, 10:20 AM

ThomasRaoux added inline comments.

mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
46	The checks are already there in the verifier so duplicating them wouldn't make a lot of sense. I added an assert checking that only one dim is distributed since this code wouldn't work otherwise.

This revision was landed with ongoing or failed builds.Sep 14 2022, 10:36 AM

Closed by commit rG4abb9e5d2054: [mlir][vector] Clean up and generalize lowering of warp_execute to scf (authored by ThomasRaoux). · Explain Why

This revision was automatically updated to reflect the committed changes.

ThomasRaoux added a commit: rG4abb9e5d2054: [mlir][vector] Clean up and generalize lowering of warp_execute to scf.

Harbormaster completed remote builds in B186665: Diff 460146.Sep 14 2022, 10:43 AM

Revision Contents

Path

Size

mlir/

lib/

Dialect/

Vector/

Transforms/

VectorDistribute.cpp

180 lines

test/

Dialect/

Vector/

vector-warp-distribute.mlir

42 lines

Diff 460151

mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp

Show All 14 Lines
#include "mlir/IR/AffineExpr.h"		#include "mlir/IR/AffineExpr.h"
#include "mlir/Transforms/SideEffectUtils.h"		#include "mlir/Transforms/SideEffectUtils.h"
#include "llvm/ADT/SetVector.h"		#include "llvm/ADT/SetVector.h"
#include <utility>		#include <utility>

using namespace mlir;		using namespace mlir;
using namespace mlir::vector;		using namespace mlir::vector;

/// TODO: add an analysis step that determines which vector dimension should be		/// Currently the distribution map is implicit based on the vector shape. In the
/// used for distribution.		/// future it will be part of the op.
static llvm::Optional<int64_t>		/// Example:
getDistributedVectorDim(VectorType distributedVectorType) {		/// ```
return (distributedVectorType)		/// %0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1x16x2xf32>) {
? llvm::Optional<int64_t>(distributedVectorType.getRank() - 1)		/// ...
: llvm::None;		/// vector.yield %3 : vector<32x16x64xf32>
}		/// }
		/// ```
static llvm::Optional<int64_t>		/// Would have an implicit map of:
getDistributedSize(VectorType distributedVectorType) {		/// `(d0, d1, d2) -> (d0, d2)`
auto dim = getDistributedVectorDim(distributedVectorType);		static AffineMap calculateImplicitMap(VectorType sequentialType,
return (dim) ? llvm::Optional<int64_t>(distributedVectorType.getDimSize(*dim))		VectorType distributedType) {
: llvm::None;		SmallVector<AffineExpr> perm;
		nicolasvasilacheUnsubmitted Done Reply Inline Actions `perm.reserve(...)` nicolasvasilache: `perm.reserve(...)`
		perm.reserve(1);
		nicolasvasilacheUnsubmitted Done Reply Inline Actions There is no yield value here, just use seq and dist types plz. nicolasvasilache: There is no yield value here, just use seq and dist types plz.
		// Check which dimensions of the sequential type are different than the
		// dimensions of the distributed type to know the distributed dimensions. Then
		// associate each distributed dimension to an ID in order.
		for (unsigned i = 0, e = sequentialType.getRank(); i < e; i++) {
		if (sequentialType.getDimSize(i) != distributedType.getDimSize(i))
		perm.push_back(getAffineDimExpr(i, distributedType.getContext()));
		}
		auto map = AffineMap::get(sequentialType.getRank(), 0, perm,
		distributedType.getContext());
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions Shouldn't there be some sanity checks here and described in the doc? I.e. things that divide modulo warp size, `size(perm) == rank \|\| size(perm) == rank - 1` etc ? nicolasvasilache: Shouldn't there be some sanity checks here and described in the doc? I.e. things that divide…
		ThomasRaouxAuthorUnsubmitted Done Reply Inline Actions The checks are already there in the verifier so duplicating it wouldn't make a lot of sense. I added an assert checking that only one dim is distributed since this code wouldn't work otherwise. ThomasRaoux: The checks are already there in the verifier so duplicating it wouldn't make a lot of sense. I…
		assert(map.getNumResults() <= 1 &&
		"only support distribution along one dimension for now.");
		return map;
}		}

namespace {		namespace {

/// Helper struct to create the load / store operations that permit transit		/// Helper struct to create the load / store operations that permit transit
/// through the parallel / sequential and the sequential / parallel boundaries		/// through the parallel / sequential and the sequential / parallel boundaries
/// when performing `rewriteWarpOpToScfFor`.		/// when performing `rewriteWarpOpToScfFor`.
///		///
/// All this assumes the vector distribution occurs along the most minor		/// The vector distribution dimension is inferred from the vector types.
		nicolasvasilacheUnsubmitted Done Reply Inline Actions nit: inferred nicolasvasilache: nit: inferred
/// distributed vector dimension.
/// TODO: which is expected to be a multiple of the warp size ?
/// TODO: add an analysis step that determines which vector dimension should
/// be used for distribution.
struct DistributedLoadStoreHelper {		struct DistributedLoadStoreHelper {
DistributedLoadStoreHelper(Value sequentialVal, Value distributedVal,		DistributedLoadStoreHelper(Value sequentialVal, Value distributedVal,
Value laneId, Value zero)		Value laneId, Value zero)
: sequentialVal(sequentialVal), distributedVal(distributedVal),		: sequentialVal(sequentialVal), distributedVal(distributedVal),
laneId(laneId), zero(zero) {		laneId(laneId), zero(zero) {
sequentialType = sequentialVal.getType();		sequentialVectorType = sequentialVal.getType().dyn_cast<VectorType>();
distributedType = distributedVal.getType();		distributedVectorType = distributedVal.getType().dyn_cast<VectorType>();
sequentialVectorType = sequentialType.dyn_cast<VectorType>();		if (sequentialVectorType && distributedVectorType)
distributedVectorType = distributedType.dyn_cast<VectorType>();		distributionMap =
		calculateImplicitMap(sequentialVectorType, distributedVectorType);
}		}

Value buildDistributedOffset(RewriterBase &b, Location loc) {		Value buildDistributedOffset(RewriterBase &b, Location loc, int64_t index) {
auto maybeDistributedSize = getDistributedSize(distributedVectorType);		int64_t distributedSize = distributedVectorType.getDimSize(index);
assert(maybeDistributedSize &&
"at this point, a distributed size must be determined");
AffineExpr tid = getAffineSymbolExpr(0, b.getContext());		AffineExpr tid = getAffineSymbolExpr(0, b.getContext());
return b.createOrFold<AffineApplyOp>(loc, tid * (*maybeDistributedSize),		return b.createOrFold<AffineApplyOp>(loc, tid * distributedSize,
ArrayRef<Value>{laneId});		ArrayRef<Value>{laneId});
}		}

/// Create a store during the process of distributing the		/// Create a store during the process of distributing the
/// `vector.warp_execute_on_thread_0` op.		/// `vector.warp_execute_on_thread_0` op.
/// Vector distribution assumes the following convention regarding the		/// Vector distribution assumes the following convention regarding the
/// temporary buffers that are created to transition values. This must		/// temporary buffers that are created to transition values. This must
/// be properly specified in the `options.warpAllocationFn`:		/// be properly specified in the `options.warpAllocationFn`:
/// 1. scalars of type T transit through a memref<1xT>.		/// 1. scalars of type T transit through a memref<1xT>.
/// 2. vectors of type V<shapexT> transit through a memref<shapexT>		/// 2. vectors of type V<shapexT> transit through a memref<shapexT>
Operation *buildStore(RewriterBase &b, Location loc, Value val,		Operation *buildStore(RewriterBase &b, Location loc, Value val,
Value buffer) {		Value buffer) {
assert((val == distributedVal \|\| val == sequentialVal) &&		assert((val == distributedVal \|\| val == sequentialVal) &&
"Must store either the preregistered distributed or the "		"Must store either the preregistered distributed or the "
"preregistered sequential value.");		"preregistered sequential value.");
		// Scalar case can directly use memref.store.
		if (!val.getType().isa<VectorType>())
		return b.create<memref::StoreOp>(loc, val, buffer, zero);

// Vector case must use vector::TransferWriteOp which will later lower to		// Vector case must use vector::TransferWriteOp which will later lower to
// vector.store of memref.store depending on further lowerings.		// vector.store of memref.store depending on further lowerings.
if (val.getType().isa<VectorType>()) {
int64_t rank = sequentialVectorType.getRank();		int64_t rank = sequentialVectorType.getRank();
if (rank == 0) {
return b.create<vector::TransferWriteOp>(loc, val, buffer, ValueRange{},
ArrayRef<bool>{});
}
SmallVector<Value> indices(rank, zero);		SmallVector<Value> indices(rank, zero);
auto maybeDistributedDim = getDistributedVectorDim(distributedVectorType);		if (val == distributedVal) {
assert(maybeDistributedDim && "must be able to deduce distributed dim");		for (auto dimExpr : distributionMap.getResults()) {
if (val == distributedVal)		int64_t index = dimExpr.cast<AffineDimExpr>().getPosition();
indices[*maybeDistributedDim] =		indices[index] = buildDistributedOffset(b, loc, index);
(val == distributedVal) ? buildDistributedOffset(b, loc) : zero;		}
		}
SmallVector<bool> inBounds(indices.size(), true);		SmallVector<bool> inBounds(indices.size(), true);
return b.create<vector::TransferWriteOp>(		return b.create<vector::TransferWriteOp>(
loc, val, buffer, indices,		loc, val, buffer, indices,
ArrayRef<bool>(inBounds.begin(), inBounds.end()));		ArrayRef<bool>(inBounds.begin(), inBounds.end()));
}		}
// Scalar case can directly use memref.store.
return b.create<memref::StoreOp>(loc, val, buffer, zero);
}

/// Create a load during the process of distributing the		/// Create a load during the process of distributing the
/// `vector.warp_execute_on_thread_0` op.		/// `vector.warp_execute_on_thread_0` op.
/// Vector distribution assumes the following convention regarding the		/// Vector distribution assumes the following convention regarding the
/// temporary buffers that are created to transition values. This must		/// temporary buffers that are created to transition values. This must
/// be properly specified in the `options.warpAllocationFn`:		/// be properly specified in the `options.warpAllocationFn`:
/// 1. scalars of type T transit through a memref<1xT>.		/// 1. scalars of type T transit through a memref<1xT>.
/// 2. vectors of type V<shapexT> transit through a memref<shapexT>		/// 2. vectors of type V<shapexT> transit through a memref<shapexT>
///		///
/// When broadcastMode is true, the load is not distributed to account for		/// When broadcastMode is true, the load is not distributed to account for
/// the broadcast semantics of the `vector.warp_execute_on_lane_0` op.		/// the broadcast semantics of the `vector.warp_execute_on_lane_0` op.
///		///
/// Example:		/// Example:
///		///
/// ```		/// ```
/// %r = vector.warp_execute_on_lane_0(...) -> (f32) {		/// %r = vector.warp_execute_on_lane_0(...) -> (f32) {
/// vector.yield %cst : f32		/// vector.yield %cst : f32
/// }		/// }
/// // Both types are f32. The constant %cst is broadcasted to all lanes.		/// // Both types are f32. The constant %cst is broadcasted to all lanes.
/// ```		/// ```
/// This behavior described in more detail in the documentation of the op.		/// This behavior described in more detail in the documentation of the op.
Value buildLoad(RewriterBase &b, Location loc, Type type, Value buffer,		Value buildLoad(RewriterBase &b, Location loc, Type type, Value buffer) {
bool broadcastMode = false) {
if (broadcastMode) {		// Scalar case can directly use memref.store.
// Broadcast mode may occur for either scalar or vector operands.		if (!type.isa<VectorType>())
auto vectorType = type.dyn_cast<VectorType>();
auto shape = buffer.getType().cast<MemRefType>();
if (vectorType) {
SmallVector<bool> inBounds(shape.getRank(), true);
return b.create<vector::TransferReadOp>(
loc, vectorType, buffer,
/*indices=*/SmallVector<Value>(shape.getRank(), zero),
ArrayRef<bool>(inBounds.begin(), inBounds.end()));
}
return b.create<memref::LoadOp>(loc, buffer, zero);		return b.create<memref::LoadOp>(loc, buffer, zero);
}

// Other cases must be vector atm.		// Other cases must be vector atm.
// Vector case must use vector::TransferReadOp which will later lower to		// Vector case must use vector::TransferReadOp which will later lower to
// vector.read of memref.read depending on further lowerings.		// vector.read of memref.read depending on further lowerings.
assert(type.isa<VectorType>() && "must be a vector type");
assert((type == distributedVectorType \|\| type == sequentialVectorType) &&		assert((type == distributedVectorType \|\| type == sequentialVectorType) &&
"Must store either the preregistered distributed or the "		"Must store either the preregistered distributed or the "
"preregistered sequential type.");		"preregistered sequential type.");
auto maybeDistributedDim = getDistributedVectorDim(distributedVectorType);
assert(maybeDistributedDim && "must be able to deduce distributed dim");
SmallVector<Value> indices(sequentialVectorType.getRank(), zero);		SmallVector<Value> indices(sequentialVectorType.getRank(), zero);
if (type == distributedVectorType) {		if (type == distributedVectorType) {
indices[*maybeDistributedDim] = buildDistributedOffset(b, loc);		for (auto dimExpr : distributionMap.getResults()) {
} else {		int64_t index = dimExpr.cast<AffineDimExpr>().getPosition();
indices[*maybeDistributedDim] = zero;		indices[index] = buildDistributedOffset(b, loc, index);
		}
}		}
SmallVector<bool> inBounds(indices.size(), true);		SmallVector<bool> inBounds(indices.size(), true);
return b.create<vector::TransferReadOp>(		return b.create<vector::TransferReadOp>(
loc, type.cast<VectorType>(), buffer, indices,		loc, type.cast<VectorType>(), buffer, indices,
ArrayRef<bool>(inBounds.begin(), inBounds.end()));		ArrayRef<bool>(inBounds.begin(), inBounds.end()));
}		}

Value sequentialVal, distributedVal, laneId, zero;		Value sequentialVal, distributedVal, laneId, zero;
Type sequentialType, distributedType;
VectorType sequentialVectorType, distributedVectorType;		VectorType sequentialVectorType, distributedVectorType;
		AffineMap distributionMap;
};		};

} // namespace		} // namespace

/// Helper to create a new WarpExecuteOnLane0Op with different signature.		/// Helper to create a new WarpExecuteOnLane0Op with different signature.
static WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(		static WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(
RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,		RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
ValueRange newYieldedValues, TypeRange newReturnTypes) {		ValueRange newYieldedValues, TypeRange newReturnTypes) {
▲ Show 20 Lines • Show All 84 Lines • ▼ Show 20 Lines	static Operation *cloneOpWithOperandsAndTypes(RewriterBase &rewriter,
Location loc, Operation *op,		Location loc, Operation *op,
ArrayRef<Value> operands,		ArrayRef<Value> operands,
ArrayRef<Type> resultTypes) {		ArrayRef<Type> resultTypes) {
OperationState res(loc, op->getName().getStringRef(), operands, resultTypes,		OperationState res(loc, op->getName().getStringRef(), operands, resultTypes,
op->getAttrs());		op->getAttrs());
return rewriter.create(res);		return rewriter.create(res);
}		}

/// Currently the distribution map is implicit based on the vector shape. In the
/// future it will be part of the op.
/// Example:
/// ```
/// %0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1x16x2xf32>) {
/// ...
/// vector.yield %3 : vector<32x16x64xf32>
/// }
/// ```
/// Would have an implicit map of:
/// `(d0, d1, d2) -> (d0, d2)`
static AffineMap calculateImplicitMap(Value yield, Value ret) {
auto srcType = yield.getType().cast<VectorType>();
auto dstType = ret.getType().cast<VectorType>();
SmallVector<AffineExpr> perm;
// Check which dimensions of the yield value are different than the dimensions
// of the result to know the distributed dimensions. Then associate each
// distributed dimension to an ID in order.
for (unsigned i = 0, e = srcType.getRank(); i < e; i++) {
if (srcType.getDimSize(i) != dstType.getDimSize(i))
perm.push_back(getAffineDimExpr(i, yield.getContext()));
}
auto map = AffineMap::get(srcType.getRank(), 0, perm, yield.getContext());
return map;
}

namespace {		namespace {

/// Rewrite a WarpExecuteOnLane0Op into a predicated scf.if op where the single		/// Rewrite a WarpExecuteOnLane0Op into a predicated scf.if op where the single
/// thread `laneId` executes the entirety of the computation.		/// thread `laneId` executes the entirety of the computation.
///		///
/// After the transformation:		/// After the transformation:
/// - the IR within the scf.if op can be thought of as executing sequentially		/// - the IR within the scf.if op can be thought of as executing sequentially
/// (from the point of view of threads along `laneId`).		/// (from the point of view of threads along `laneId`).
Show All 14 Lines
/// 5. Insert appropriate writes within scf.if and reads after the scf.if to		/// 5. Insert appropriate writes within scf.if and reads after the scf.if to
/// transit the values returned by the op.		/// transit the values returned by the op.
/// 6. Synchronize after the scf.if to ensure all writes inserted in 5. are		/// 6. Synchronize after the scf.if to ensure all writes inserted in 5. are
/// consistent after the scf.if.		/// consistent after the scf.if.
/// 7. Perform late cleanups.		/// 7. Perform late cleanups.
///		///
/// All this assumes the vector distribution occurs along the most minor		/// All this assumes the vector distribution occurs along the most minor
/// distributed vector dimension.		/// distributed vector dimension.
/// TODO: which is expected to be a multiple of the warp size ?		struct WarpOpToScfIfPattern : public OpRewritePattern<WarpExecuteOnLane0Op> {
/// TODO: add an analysis step that determines which vector dimension should be		WarpOpToScfIfPattern(MLIRContext *context,
/// used for distribution.
struct WarpOpToScfForPattern : public OpRewritePattern<WarpExecuteOnLane0Op> {
WarpOpToScfForPattern(MLIRContext *context,
const WarpExecuteOnLane0LoweringOptions &options,		const WarpExecuteOnLane0LoweringOptions &options,
PatternBenefit benefit = 1)		PatternBenefit benefit = 1)
: OpRewritePattern<WarpExecuteOnLane0Op>(context, benefit),		: OpRewritePattern<WarpExecuteOnLane0Op>(context, benefit),
options(options) {}		options(options) {}

LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,		LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
assert(warpOp.getBodyRegion().hasOneBlock() &&		assert(warpOp.getBodyRegion().hasOneBlock() &&
"expected WarpOp with single block");		"expected WarpOp with single block");
Block *warpOpBody = &warpOp.getBodyRegion().front();		Block *warpOpBody = &warpOp.getBodyRegion().front();
Show All 23 Lines	for (const auto &it : llvm::enumerate(warpOp.getArgs())) {
// Create buffer before the ifOp.		// Create buffer before the ifOp.
rewriter.setInsertionPoint(ifOp);		rewriter.setInsertionPoint(ifOp);
Value buffer = options.warpAllocationFn(loc, rewriter, warpOp,		Value buffer = options.warpAllocationFn(loc, rewriter, warpOp,
sequentialVal.getType());		sequentialVal.getType());
// Store distributed vector into buffer, before the ifOp.		// Store distributed vector into buffer, before the ifOp.
helper.buildStore(rewriter, loc, distributedVal, buffer);		helper.buildStore(rewriter, loc, distributedVal, buffer);
// Load sequential vector from buffer, inside the ifOp.		// Load sequential vector from buffer, inside the ifOp.
rewriter.setInsertionPointToStart(ifOp.thenBlock());		rewriter.setInsertionPointToStart(ifOp.thenBlock());
bool broadcastMode =		bbArgReplacements.push_back(
(sequentialVal.getType() == distributedVal.getType());		helper.buildLoad(rewriter, loc, sequentialVal.getType(), buffer));
bbArgReplacements.push_back(helper.buildLoad(
rewriter, loc, sequentialVal.getType(), buffer, broadcastMode));
}		}

// Step 3. Insert sync after all the stores and before all the loads.		// Step 3. Insert sync after all the stores and before all the loads.
if (!warpOp.getArgs().empty()) {		if (!warpOp.getArgs().empty()) {
rewriter.setInsertionPoint(ifOp);		rewriter.setInsertionPoint(ifOp);
options.warpSyncronizationFn(loc, rewriter, warpOp);		options.warpSyncronizationFn(loc, rewriter, warpOp);
}		}

Show All 20 Lines	for (const auto &it : llvm::enumerate(yieldOp.operands())) {

// Store yielded value into buffer, inside the ifOp, before the		// Store yielded value into buffer, inside the ifOp, before the
// terminator.		// terminator.
rewriter.setInsertionPoint(yieldOp);		rewriter.setInsertionPoint(yieldOp);
helper.buildStore(rewriter, loc, sequentialVal, buffer);		helper.buildStore(rewriter, loc, sequentialVal, buffer);

// Load distributed value from buffer, after the warpOp.		// Load distributed value from buffer, after the warpOp.
rewriter.setInsertionPointAfter(ifOp);		rewriter.setInsertionPointAfter(ifOp);
bool broadcastMode =
(sequentialVal.getType() == distributedVal.getType());
// Result type and yielded value type are the same. This is a broadcast.		// Result type and yielded value type are the same. This is a broadcast.
// E.g.:		// E.g.:
// %r = vector.warp_execute_on_lane_0(...) -> (f32) {		// %r = vector.warp_execute_on_lane_0(...) -> (f32) {
// vector.yield %cst : f32		// vector.yield %cst : f32
// }		// }
// Both types are f32. The constant %cst is broadcasted to all lanes.		// Both types are f32. The constant %cst is broadcasted to all lanes.
// This is described in more detail in the documentation of the op.		// This is described in more detail in the documentation of the op.
replacements.push_back(helper.buildLoad(		replacements.push_back(
rewriter, loc, distributedVal.getType(), buffer, broadcastMode));		helper.buildLoad(rewriter, loc, distributedVal.getType(), buffer));
}		}

// Step 6. Insert sync after all the stores and before all the loads.		// Step 6. Insert sync after all the stores and before all the loads.
if (!yieldOp.operands().empty()) {		if (!yieldOp.operands().empty()) {
rewriter.setInsertionPointAfter(ifOp);		rewriter.setInsertionPointAfter(ifOp);
options.warpSyncronizationFn(loc, rewriter, warpOp);		options.warpSyncronizationFn(loc, rewriter, warpOp);
}		}

▲ Show 20 Lines • Show All 327 Lines • ▼ Show 20 Lines	LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
// Don't duplicate transfer_read ops when distributing.		// Don't duplicate transfer_read ops when distributing.
if (!read.getResult().hasOneUse())		if (!read.getResult().hasOneUse())
return failure();		return failure();
unsigned operandIndex = operand->getOperandNumber();		unsigned operandIndex = operand->getOperandNumber();
Value distributedVal = warpOp.getResult(operandIndex);		Value distributedVal = warpOp.getResult(operandIndex);

SmallVector<Value, 4> indices(read.getIndices().begin(),		SmallVector<Value, 4> indices(read.getIndices().begin(),
read.getIndices().end());		read.getIndices().end());
AffineMap map = calculateImplicitMap(read.getResult(), distributedVal);		auto sequentialType = read.getResult().getType().cast<VectorType>();
		auto distributedType = distributedVal.getType().cast<VectorType>();
		AffineMap map = calculateImplicitMap(sequentialType, distributedType);
AffineMap indexMap = map.compose(read.getPermutationMap());		AffineMap indexMap = map.compose(read.getPermutationMap());
OpBuilder::InsertionGuard g(rewriter);		OpBuilder::InsertionGuard g(rewriter);
rewriter.setInsertionPointAfter(warpOp);		rewriter.setInsertionPointAfter(warpOp);
for (auto it : llvm::zip(indexMap.getResults(), map.getResults())) {		for (auto it : llvm::zip(indexMap.getResults(), map.getResults())) {
AffineExpr d0, d1;		AffineExpr d0, d1;
bindDims(read.getContext(), d0, d1);		bindDims(read.getContext(), d0, d1);
auto indexExpr = std::get<0>(it).dyn_cast<AffineDimExpr>();		auto indexExpr = std::get<0>(it).dyn_cast<AffineDimExpr>();
if (!indexExpr)		if (!indexExpr)
▲ Show 20 Lines • Show All 343 Lines • ▼ Show 20 Lines	private:
DistributedReductionFn distributedReductionFn;		DistributedReductionFn distributedReductionFn;
};		};

} // namespace		} // namespace

void mlir::vector::populateWarpExecuteOnLane0OpToScfForPattern(		void mlir::vector::populateWarpExecuteOnLane0OpToScfForPattern(
RewritePatternSet &patterns,		RewritePatternSet &patterns,
const WarpExecuteOnLane0LoweringOptions &options, PatternBenefit benefit) {		const WarpExecuteOnLane0LoweringOptions &options, PatternBenefit benefit) {
patterns.add<WarpOpToScfForPattern>(patterns.getContext(), options, benefit);		patterns.add<WarpOpToScfIfPattern>(patterns.getContext(), options, benefit);
}		}

void mlir::vector::populateDistributeTransferWriteOpPatterns(		void mlir::vector::populateDistributeTransferWriteOpPatterns(
RewritePatternSet &patterns, const DistributionMapFn &distributionMapFn,		RewritePatternSet &patterns, const DistributionMapFn &distributionMapFn,
PatternBenefit benefit) {		PatternBenefit benefit) {
patterns.add<WarpOpTransferWrite>(patterns.getContext(), distributionMapFn,		patterns.add<WarpOpTransferWrite>(patterns.getContext(), distributionMapFn,
benefit);		benefit);
}		}
▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines

mlir/test/Dialect/Vector/vector-warp-distribute.mlir

Show First 20 Lines • Show All 728 Lines • ▼ Show 20 Lines	func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: vector<f32>, %v1: vector<1xf32>, %v2: vector<1x1xf32>)
// CHECK-SCF-IF: gpu.barrier		// CHECK-SCF-IF: gpu.barrier
// CHECK-SCF-IF: %[[RV2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>		// CHECK-SCF-IF: %[[RV2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
// CHECK-SCF-IF: %[[RV1:.*]] = vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>		// CHECK-SCF-IF: %[[RV1:.*]] = vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
// CHECK-SCF-IF: %[[RV0:.*]] = vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>		// CHECK-SCF-IF: %[[RV0:.*]] = vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
// CHECK-SCF-IF: %[[RS0:.*]] = memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3>		// CHECK-SCF-IF: %[[RS0:.*]] = memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3>
// CHECK-SCF-IF: return %[[RS0]], %[[RV0]], %[[RV1]], %[[RV2]] : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>		// CHECK-SCF-IF: return %[[RS0]], %[[RV0]], %[[RV1]], %[[RV2]] : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
return %r#0, %r#1, %r#2, %r#3 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>		return %r#0, %r#1, %r#2, %r#3 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
}		}

		// -----

		// CHECK-SCF-IF-DAG: #[[$TIMES2:.*]] = affine_map<()[s0] -> (s0 * 2)>

		// CHECK-SCF-IF: func @warp_execute_nd_distribute
		// CHECK-SCF-IF-SAME: (%[[LANEID:.*]]: index
		func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %v1: vector<1x2x128xf32>)
		-> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
		// CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index

		// CHECK-SCF-IF: vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEID]], %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x1xf32>, memref<32x64x1xf32, 3>
		// CHECK-SCF-IF: %[[RID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
		// CHECK-SCF-IF: vector.transfer_write %{{.*}}, %{{.*}}[%[[C0]], %[[RID]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x2x128xf32>, memref<1x64x128xf32, 3>
		// CHECK-SCF-IF: gpu.barrier

		// CHECK-SCF-IF: scf.if{{.*}}{
		%r:2 = vector.warp_execute_on_lane_0(%laneid)[32]
		args(%v0, %v1 : vector<1x64x1xf32>, vector<1x2x128xf32>) -> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
		^bb0(%arg0: vector<32x64x1xf32>, %arg1: vector<1x64x128xf32>):

		// CHECK-SCF-IF-DAG: %[[SR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<32x64x1xf32>
		// CHECK-SCF-IF-DAG: %[[SR1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x64x128xf32>
		// CHECK-SCF-IF: %[[W0:.*]] = "some_def_0"(%[[SR0]]) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
		// CHECK-SCF-IF: %[[W1:.*]] = "some_def_1"(%[[SR1]]) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>
		// CHECK-SCF-IF-DAG: vector.transfer_write %[[W0]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<32x64x1xf32>, memref<32x64x1xf32, 3>
		// CHECK-SCF-IF-DAG: vector.transfer_write %[[W1]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x64x128xf32>, memref<1x64x128xf32, 3>

		%r0 = "some_def_0"(%arg0) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
		%r1 = "some_def_1"(%arg1) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>

		// CHECK-SCF-IF-NOT: vector.yield
		vector.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32>
		}

		// CHECK-SCF-IF: gpu.barrier
		// CHECK-SCF-IF: %[[WID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
		// CHECK-SCF-IF-DAG: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]], %cst {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<1x64x1xf32>
		// CHECK-SCF-IF-DAG: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[WID]], %[[C0]]], %cst {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x2x128xf32>
		// CHECK-SCF-IF: return %[[R0]], %[[R1]] : vector<1x64x1xf32>, vector<1x2x128xf32>
		return %r#0, %r#1 : vector<1x64x1xf32>, vector<1x2x128xf32>
		}