This is an archive of the discontinued LLVM Phabricator instance.

[mlir][vector] Modify constraint and interface for warp reduce on f16 and i8
ClosedPublic

Authored by raikonenfnu on Nov 8 2022, 11:11 PM.

Download Raw Diff

Details

Reviewers

aartbik
nicolasvasilache
dcaballe
ThomasRaoux

Commits

rGd2061530dc09: [mlir][vector] Modify constraint and interface for warp reduce on f16 and i8

Summary

Quantization methods are crucial and ubiquitous in accelerating machine
learning workloads. Most of these methods use f16 and i8 types.

This patch relaxes the type constraints on warp reduce distribution to
allow these types. Furthermore, this patch changes the interface and
moves the initial reduction of data on a single thread into the
distributedReductionFn. This gives developers the flexibility to control
how they obtain the initial lane value, which might differ based on the
input types (e.g., to shuffle a 32-bit-wide type, we need to reduce f16
data to 2xf16 rather than to a single element).

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

raikonenfnu created this revision.Nov 8 2022, 11:11 PM

Herald added a reviewer: aartbik. · View Herald TranscriptNov 8 2022, 11:11 PM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: Moerafaat, zero9178, bzcheeseman and 21 others. · View Herald Transcript

raikonenfnu requested review of this revision.Nov 8 2022, 11:11 PM

Herald added a reviewer: nicolasvasilache. · View Herald TranscriptNov 8 2022, 11:11 PM

Herald added a reviewer: dcaballe. · View Herald Transcript

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: stephenneuendorffer, nicolasvasilache. · View Herald Transcript

Lint code

raikonenfnu added a reviewer: ThomasRaoux.Nov 8 2022, 11:14 PM

Harbormaster completed remote builds in B196831: Diff 474164.Nov 8 2022, 11:36 PM

ThomasRaoux accepted this revision.Nov 9 2022, 11:21 AM

This revision is now accepted and ready to land.Nov 9 2022, 11:21 AM

Closed by commit rGd2061530dc09: [mlir][vector] Modify constraint and interface for warp reduce on f16 and i8 (authored by raikonenfnu). · Explain WhyNov 9 2022, 11:54 AM

This revision was automatically updated to reflect the committed changes.

raikonenfnu added a commit: rGd2061530dc09: [mlir][vector] Modify constraint and interface for warp reduce on f16 and i8.

Revision Contents

Path

Size

mlir/

lib/

Dialect/

Vector/

Transforms/

VectorDistribute.cpp

17 lines

test/

lib/

Dialect/

Vector/

TestVectorTransforms.cpp

3 lines

Diff 474323

mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp

Show First 20 Lines • Show All 1,129 Lines • ▼ Show 20 Lines	LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
// Only rank 1 vectors supported.		// Only rank 1 vectors supported.
if (vectorType.getRank() != 1)		if (vectorType.getRank() != 1)
return rewriter.notifyMatchFailure(		return rewriter.notifyMatchFailure(
warpOp, "Only rank 1 reductions can be distributed.");		warpOp, "Only rank 1 reductions can be distributed.");
// Only warp_size-sized vectors supported.		// Only warp_size-sized vectors supported.
if (vectorType.getShape()[0] % warpOp.getWarpSize() != 0)		if (vectorType.getShape()[0] % warpOp.getWarpSize() != 0)
return rewriter.notifyMatchFailure(		return rewriter.notifyMatchFailure(
warpOp, "Reduction vector dimension must match was size.");		warpOp, "Reduction vector dimension must match was size.");
// Only f32 and i32 element types are supported.		// Only f32, i32, f16, i8 element types are supported.
if (!reductionOp.getType().isF32() &&		if (!reductionOp.getType().isF32() &&
!reductionOp.getType().isSignlessInteger(32))		!reductionOp.getType().isSignlessInteger(32) &&
		!reductionOp.getType().isF16() && !reductionOp.getType().isInteger(8))
return rewriter.notifyMatchFailure(		return rewriter.notifyMatchFailure(
warpOp,		warpOp, "Reduction distribution currently only supports 32bits, f16, "
"Reduction distribution currently only supports 32bits types.");		"and i8 types.");

int64_t numElements = vectorType.getShape()[0] / warpOp.getWarpSize();		int64_t numElements = vectorType.getShape()[0] / warpOp.getWarpSize();
// Return vector that will be reduced from the WarpExecuteOnLane0Op.		// Return vector that will be reduced from the WarpExecuteOnLane0Op.
unsigned operandIndex = yieldOperand->getOperandNumber();		unsigned operandIndex = yieldOperand->getOperandNumber();
SmallVector<Value> yieldValues = {reductionOp.getVector()};		SmallVector<Value> yieldValues = {reductionOp.getVector()};
SmallVector<Type> retTypes = {		SmallVector<Type> retTypes = {
VectorType::get({numElements}, reductionOp.getType())};		VectorType::get({numElements}, reductionOp.getType())};
if (reductionOp.getAcc()) {		if (reductionOp.getAcc()) {
yieldValues.push_back(reductionOp.getAcc());		yieldValues.push_back(reductionOp.getAcc());
retTypes.push_back(reductionOp.getAcc().getType());		retTypes.push_back(reductionOp.getAcc().getType());
}		}
SmallVector<size_t> newRetIndices;		SmallVector<size_t> newRetIndices;
WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(		WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, yieldValues, retTypes, newRetIndices);		rewriter, warpOp, yieldValues, retTypes, newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);		rewriter.setInsertionPointAfter(newWarpOp);

		// Obtain data to reduce for a single lane.
Value laneValVec = newWarpOp.getResult(newRetIndices[0]);		Value laneValVec = newWarpOp.getResult(newRetIndices[0]);
// First reduce on a single thread.		// Distribute and reduce across threads.
Value perLaneReduction = rewriter.create<vector::ReductionOp>(
reductionOp.getLoc(), reductionOp.getKind(), laneValVec);
// Then distribute across threads.
Value fullReduce =		Value fullReduce =
distributedReductionFn(reductionOp.getLoc(), rewriter, perLaneReduction,		distributedReductionFn(reductionOp.getLoc(), rewriter, laneValVec,
reductionOp.getKind(), newWarpOp.getWarpSize());		reductionOp.getKind(), newWarpOp.getWarpSize());
if (reductionOp.getAcc()) {		if (reductionOp.getAcc()) {
fullReduce = vector::makeArithReduction(		fullReduce = vector::makeArithReduction(
rewriter, reductionOp.getLoc(), reductionOp.getKind(), fullReduce,		rewriter, reductionOp.getLoc(), reductionOp.getKind(), fullReduce,
newWarpOp.getResult(newRetIndices[1]));		newWarpOp.getResult(newRetIndices[1]));
}		}
newWarpOp.getResult(operandIndex).replaceAllUsesWith(fullReduce);		newWarpOp.getResult(operandIndex).replaceAllUsesWith(fullReduce);
return success();		return success();
▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines

mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp

Show First 20 Lines • Show All 680 Lines • ▼ Show 20 Lines	static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder,
global->moveBefore(&moduleOp.front());		global->moveBefore(&moduleOp.front());

builder.restoreInsertionPoint(ip);		builder.restoreInsertionPoint(ip);
return builder.create<memref::GetGlobalOp>(loc, memrefType, symbolName);		return builder.create<memref::GetGlobalOp>(loc, memrefType, symbolName);
}		}

static Value warpReduction(Location loc, OpBuilder &builder, Value input,		static Value warpReduction(Location loc, OpBuilder &builder, Value input,
CombiningKind kind, uint32_t size) {		CombiningKind kind, uint32_t size) {
Value laneVal = input;		// First reduce on a single thread to get per lane reduction value.
		Value laneVal = builder.create<vector::ReductionOp>(loc, kind, input);
// Parallel reduction using butterfly shuffles.		// Parallel reduction using butterfly shuffles.
for (uint64_t i = 1; i < size; i <<= 1) {		for (uint64_t i = 1; i < size; i <<= 1) {
Value shuffled = builder		Value shuffled = builder
.create<gpu::ShuffleOp>(loc, laneVal, i,		.create<gpu::ShuffleOp>(loc, laneVal, i,
/width=/size,		/width=/size,
/mode=/gpu::ShuffleMode::XOR)		/mode=/gpu::ShuffleMode::XOR)
.getShuffleResult();		.getShuffleResult();
laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);		laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
▲ Show 20 Lines • Show All 126 Lines • Show Last 20 Lines