This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
mlir/lib/Dialect/Linalg/Transforms/
-
lib/
-
Dialect/
-
Linalg/
-
Transforms/
-
ElementwiseOpFusion.cpp

Differential D112099

[mlir][linalg] Fix FoldConstantTranspose execution inefficiency
ClosedPublic

Authored by antiagainst on Oct 19 2021, 3:15 PM.

Download Raw Diff

Details

Reviewers

mravishankar
nicolasvasilache
rriddle

Commits

rGc788cad83b6b: [mlir][linalg] Fix FoldConstantTranspose execution inefficiency

Summary

Move SmallVectors outside of inner loops to avoid frequent allocations and deallocations.
Calculate the linearized index and call flat range getters to avoid the internal shape querying behind getValue.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

antiagainst created this revision. Oct 19 2021, 3:15 PM

Herald added a reviewer: mravishankar. · View Herald Transcript · Oct 19 2021, 3:15 PM

Herald added subscribers: wenzhicui, wrengr, Chia-hungDuan and 19 others. · View Herald Transcript

antiagainst requested review of this revision. Oct 19 2021, 3:15 PM

Herald added a reviewer: nicolasvasilache. · View Herald Transcript · Oct 19 2021, 3:15 PM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: limo1996, stephenneuendorffer, nicolasvasilache. · View Herald Transcript

antiagainst added a reviewer: rriddle. Oct 19 2021, 3:16 PM

Harbormaster completed remote builds in B129615: Diff 380788. Oct 19 2021, 3:30 PM

Split int/float cases entirely to avoid getters in the loop

Harbormaster completed remote builds in B129759: Diff 380993. Oct 20 2021, 10:10 AM

Thanks!

This revision is now accepted and ready to land. Oct 26 2021, 3:51 PM

Closed by commit rGc788cad83b6b: [mlir][linalg] Fix FoldConstantTranspose execution inefficiency (authored by antiagainst). · Explain Why · Oct 28 2021, 6:49 AM

This revision was automatically updated to reflect the committed changes.

antiagainst added a commit: rGc788cad83b6b: [mlir][linalg] Fix FoldConstantTranspose execution inefficiency.

Revision Contents

Path

Size

mlir/

lib/

Dialect/

Linalg/

Transforms/

ElementwiseOpFusion.cpp

126 lines

Diff 383023

mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp

Show First 20 Lines • Show All 1,280 Lines • ▼ Show 20 Lines
/// ```		/// ```
///		///
/// The latter inspects the region and returns the computation inside as a		/// The latter inspects the region and returns the computation inside as a
/// functor. The functor will be invoked with constant elements for all inputs		/// functor. The functor will be invoked with constant elements for all inputs
/// and should return the corresponding computed constant element for output.		/// and should return the corresponding computed constant element for output.
template <typename ConcreteType>		template <typename ConcreteType>
class FoldConstantBase : public OpRewritePattern<GenericOp> {		class FoldConstantBase : public OpRewritePattern<GenericOp> {
public:		public:
		struct APIntOrFloat {
		Optional<APInt> apInt;
		Optional<APFloat> apFloat;
		};
struct APIntOrFloatArray {		struct APIntOrFloatArray {
SmallVector<APInt> apInts;		SmallVector<APInt> apInts;
SmallVector<APFloat> apFloats;		SmallVector<APFloat> apFloats;
};		};
using RegionComputationFn =		using RegionComputationFn =
std::function<APIntOrFloatArray(APIntOrFloatArray)>;		std::function<APIntOrFloat(const APIntOrFloatArray &)>;

FoldConstantBase(MLIRContext *context,		FoldConstantBase(MLIRContext *context,
const ControlElementwiseOpsFusionFn &controlFn,		const ControlElementwiseOpsFusionFn &controlFn,
PatternBenefit benefit = 1)		PatternBenefit benefit = 1)
: OpRewritePattern<GenericOp>(context, benefit), controlFn(controlFn) {}		: OpRewritePattern<GenericOp>(context, benefit), controlFn(controlFn) {}

LogicalResult matchAndRewrite(GenericOp genericOp,		LogicalResult matchAndRewrite(GenericOp genericOp,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
▲ Show 20 Lines • Show All 95 Lines • ▼ Show 20 Lines	LogicalResult matchAndRewrite(GenericOp genericOp,
};		};

SmallVector<SmallVector<unsigned>> inputDims;		SmallVector<SmallVector<unsigned>> inputDims;
for (int i = 0; i < numInputs; ++i)		for (int i = 0; i < numInputs; ++i)
inputDims.push_back(getDimPositions(genericOp.getIndexingMaps()[i]));		inputDims.push_back(getDimPositions(genericOp.getIndexingMaps()[i]));
auto outputDims = getDimPositions(genericOp.getIndexingMaps().back());		auto outputDims = getDimPositions(genericOp.getIndexingMaps().back());
auto outputShape = outputType.getShape();		auto outputShape = outputType.getShape();

// Transpose the input constant. Because we don't know its rank in advance,		// Allocate small vectors for index delinearization. Initial values do not
// we need to loop over the range [0, element count) and delinearize the		// matter here as they will be overwritten later.
// index.
for (int linearIndex0 = 0; linearIndex0 < numElements; ++linearIndex0) {
SmallVector<uint64_t> indices(loopBounds.size(), 0);		SmallVector<uint64_t> indices(loopBounds.size(), 0);
int totalCount = linearIndex0;		SmallVector<uint64_t> dstIndices(loopBounds.size(), 0);
		SmallVector<SmallVector<uint64_t>> srcIndices(
		numInputs, SmallVector<uint64_t>(loopBounds.size(), 0));
		SmallVector<uint64_t> srcLinearIndices(numInputs, 0);
		uint64_t dstLinearIndex = 0;

		// Allocate spaces for compute function inputs. Initial values do not matter
		// here as they will be overwritten later.
		APIntOrFloatArray computeFnInputs;

		auto inputShapes = llvm::to_vector<4>(
		llvm::map_range(genericOp.getInputOperands(), [](OpOperand *operand) {
		return operand->get().getType().cast<ShapedType>().getShape();
		}));

		// Given a `linearIndex`, remap it to a linear index to access linalg op
		// inputs/outputs. This mutates `indices`, `srcIndices`, `dstIndices`,
		// `srcLinearIndices`, `dstLinearIndex` in place.
		auto computeRemappedLinearIndex = [&](int linearIndex) {
		int totalCount = linearIndex;
for (int dim = loopBounds.size() - 1; dim >= 0; --dim) {		for (int dim = loopBounds.size() - 1; dim >= 0; --dim) {
indices[dim] = totalCount % loopBounds[dim];		indices[dim] = totalCount % loopBounds[dim];
totalCount /= loopBounds[dim];		totalCount /= loopBounds[dim];
}		}

SmallVector<SmallVector<uint64_t>> srcIndices;
for (int i = 0; i < numInputs; ++i)
srcIndices.emplace_back(loopBounds.size(), 0);
SmallVector<uint64_t> dstIndices(loopBounds.size(), 0);

for (int dim = loopBounds.size() - 1; dim >= 0; --dim) {		for (int dim = loopBounds.size() - 1; dim >= 0; --dim) {
for (int i = 0; i < numInputs; ++i)		for (int i = 0; i < numInputs; ++i)
srcIndices[i][dim] = indices[inputDims[i][dim]];		srcIndices[i][dim] = indices[inputDims[i][dim]];
dstIndices[dim] = indices[outputDims[dim]];		dstIndices[dim] = indices[outputDims[dim]];
}		}

uint64_t linearIndex1 = dstIndices.front();		dstLinearIndex = dstIndices.front();
for (int dim = 1; dim < outputType.getRank(); ++dim)		for (int i = 0; i < numInputs; ++i)
linearIndex1 = linearIndex1 * outputShape[dim] + dstIndices[dim];		srcLinearIndices[i] = srcIndices[i].front();

// Collect constant elements for all inputs at this loop iteration.		for (int dim = 1; dim < outputType.getRank(); ++dim) {
SmallVector<APInt> intValues;		dstLinearIndex = dstLinearIndex * outputShape[dim] + dstIndices[dim];
SmallVector<APFloat> fpValues;
if (elementType.isa<FloatType>()) {
for (int i = 0; i < numInputs; ++i)		for (int i = 0; i < numInputs; ++i)
fpValues.push_back(inputValues[i].getValue<APFloat>(srcIndices[i]));		srcLinearIndices[i] =
} else {		srcLinearIndices[i] * inputShapes[i][dim] + srcIndices[i][dim];
		}
		};

		bool isFloat = elementType.isa<FloatType>();
		if (isFloat) {
		SmallVector<iterator_range<DenseElementsAttr::FloatElementIterator>>
		inputFpIterators;
for (int i = 0; i < numInputs; ++i)		for (int i = 0; i < numInputs; ++i)
intValues.push_back(inputValues[i].getValue<APInt>(srcIndices[i]));		inputFpIterators.push_back(inputValues[i].getValues<APFloat>());

		computeFnInputs.apFloats.resize(numInputs, APFloat(0.f));

		// Transpose the input constant. Because we don't know its rank in
		// advance, we need to loop over the range [0, element count) and
		// delinearize the index.
		for (int linearIndex = 0; linearIndex < numElements; ++linearIndex) {
		computeRemappedLinearIndex(linearIndex);

		// Collect constant elements for all inputs at this loop iteration.
		for (int i = 0; i < numInputs; ++i) {
		computeFnInputs.apFloats[i] =
		*(inputFpIterators[i].begin() + srcLinearIndices[i]);
}		}

// Invoke the computation to get the corresponding constant output		// Invoke the computation to get the corresponding constant output
// element.		// element.
APIntOrFloatArray inputs = {intValues, fpValues};		APIntOrFloat outputs = computeFn(computeFnInputs);
APIntOrFloatArray outputs = computeFn(inputs);

if (elementType.isa<FloatType>()) {		fpOutputValues[dstLinearIndex] = outputs.apFloat.getValue();
fpOutputValues[linearIndex1] = outputs.apFloats.front();		}
} else {		} else {
intOutputValues[linearIndex1] = outputs.apInts.front();		SmallVector<iterator_range<DenseElementsAttr::IntElementIterator>>
		inputIntIterators;
		for (int i = 0; i < numInputs; ++i)
		inputIntIterators.push_back(inputValues[i].getValues<APInt>());

		computeFnInputs.apInts.resize(numInputs);

		// Transpose the input constant. Because we don't know its rank in
		// advance, we need to loop over the range [0, element count) and
		// delinearize the index.
		for (int linearIndex = 0; linearIndex < numElements; ++linearIndex) {
		computeRemappedLinearIndex(linearIndex);

		// Collect constant elements for all inputs at this loop iteration.
		for (int i = 0; i < numInputs; ++i) {
		computeFnInputs.apInts[i] =
		*(inputIntIterators[i].begin() + srcLinearIndices[i]);
		}

		// Invoke the computation to get the corresponding constant output
		// element.
		APIntOrFloat outputs = computeFn(computeFnInputs);

		intOutputValues[dstLinearIndex] = outputs.apInt.getValue();
}		}
}		}

DenseIntOrFPElementsAttr outputAttr;		DenseIntOrFPElementsAttr outputAttr;
if (elementType.isa<FloatType>()) {		if (isFloat) {
outputAttr = DenseFPElementsAttr::get(outputType, fpOutputValues);		outputAttr = DenseFPElementsAttr::get(outputType, fpOutputValues);
} else {		} else {
outputAttr = DenseIntElementsAttr::get(outputType, intOutputValues);		outputAttr = DenseIntElementsAttr::get(outputType, intOutputValues);
}		}
rewriter.replaceOpWithNewOp<ConstantOp>(genericOp, outputAttr);		rewriter.replaceOpWithNewOp<ConstantOp>(genericOp, outputAttr);
return success();		return success();
}		}

Show All 24 Lines	for (Value yieldVal : yieldOp.values()) {
auto yieldArg = yieldVal.dyn_cast<BlockArgument>();		auto yieldArg = yieldVal.dyn_cast<BlockArgument>();
if (!yieldArg \|\| yieldArg.getOwner() != &body)		if (!yieldArg \|\| yieldArg.getOwner() != &body)
return nullptr;		return nullptr;
if (yieldArg.getArgNumber() != 0)		if (yieldArg.getArgNumber() != 0)
return nullptr;		return nullptr;
}		}

// No computation; just return the original value.		// No computation; just return the original value.
return [](APIntOrFloatArray inputs) { return inputs; };		return [](const APIntOrFloatArray &inputs) {
		if (inputs.apFloats.empty())
		return APIntOrFloat{inputs.apInts.front(), llvm::None};
		return APIntOrFloat{llvm::None, inputs.apFloats.front()};
		};
}		}

ControlElementwiseOpsFusionFn controlFn;		ControlElementwiseOpsFusionFn controlFn;
};		};

} // namespace		} // namespace

static Optional<SmallVector<Value>>		static Optional<SmallVector<Value>>
▲ Show 20 Lines • Show All 197 Lines • Show Last 20 Lines