Diff 429841

mlir/include/mlir/Conversion/Passes.td

Show First 20 Lines • Show All 720 Lines • ▼ Show 20 Lines	let description = [{
Pass that converts TOSA operations to the equivalent operations using the		Pass that converts TOSA operations to the equivalent operations using the
operations in the Arith dialect. The ApplyScale operator is optionally		operations in the Arith dialect. The ApplyScale operator is optionally
included as it is often preserved until the final invocation.		included as it is often preserved until the final invocation.
}];		}];

let options = [		let options = [
Option<"includeApplyRescale", "include-apply-rescale",		Option<"includeApplyRescale", "include-apply-rescale",
"bool", /default=/"false",		"bool", /default=/"false",
"Whether to include the lowering for tosa.apply_rescale to arith">		"Whether to include the lowering for tosa.apply_rescale to arith">,
		// When set, the pass asks the rescale pattern population to prefer the
		// 32-bit tosa.apply_scale lowering; defaults to off.
		Option<"use32Bit", "use-32-bit",
		       "bool", /*default=*/"false",
		       "Whether to prioritize lowering to 32-bit operations">
];		];

let constructor = "tosa::createTosaToArith()";		let constructor = "tosa::createTosaToArith()";
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// TosaToLinalg		// TosaToLinalg
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
▲ Show 20 Lines • Show All 168 Lines • Show Last 20 Lines

mlir/include/mlir/Conversion/TosaToArith/TosaToArith.h

	Show All 16 Lines

	namespace mlir {			namespace mlir {
	namespace tosa {			namespace tosa {

	std::unique_ptr<Pass> createTosaToArith();			std::unique_ptr<Pass> createTosaToArith();

	void populateTosaToArithConversionPatterns(RewritePatternSet *patterns);			void populateTosaToArithConversionPatterns(RewritePatternSet *patterns);

	void populateTosaRescaleToArithConversionPatterns(RewritePatternSet *patterns);			void populateTosaRescaleToArithConversionPatterns(RewritePatternSet *patterns,
				bool use32Bit);

	} // namespace tosa			} // namespace tosa
	} // namespace mlir			} // namespace mlir

	#endif // MLIR_CONVERSION_TOSATOARITH_TOSATOARITH_H			#endif // MLIR_CONVERSION_TOSATOARITH_TOSATOARITH_H

mlir/lib/Conversion/TosaToArith/TosaToArith.cpp

//===- TosaToArith.cpp - Lowering Tosa to Arith Dialect -------------===//		//===- TosaToArith.cpp - Lowering Tosa to Arith Dialect -------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// These rewriters lower from the Tosa to the Arith dialect.		// These rewriters lower from the Tosa to the Arith dialect.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Conversion/TosaToArith/TosaToArith.h"		#include "mlir/Conversion/TosaToArith/TosaToArith.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"		#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Tosa/IR/TosaOps.h"		#include "mlir/Dialect/Tosa/IR/TosaOps.h"
#include "mlir/IR/PatternMatch.h"		#include "mlir/IR/PatternMatch.h"
		#include "mlir/IR/TypeUtilities.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"		#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;		using namespace mlir;
using namespace tosa;		using namespace tosa;

namespace {		namespace {

class ConstOpConverter : public OpRewritePattern<tosa::ConstOp> {		class ConstOpConverter : public OpRewritePattern<tosa::ConstOp> {
Show All 19 Lines	if (auto shapedTy = type.dyn_cast<ShapedType>()) {
Type eTy = shapedTy.getElementType();		Type eTy = shapedTy.getElementType();
APInt valueInt(eTy.getIntOrFloatBitWidth(), value);		APInt valueInt(eTy.getIntOrFloatBitWidth(), value);
return DenseIntElementsAttr::get(shapedTy, valueInt);		return DenseIntElementsAttr::get(shapedTy, valueInt);
}		}

return rewriter.getIntegerAttr(type, value);		return rewriter.getIntegerAttr(type, value);
}		}

		// Materializes an arith.constant at `loc` holding `value` as `type`.
		// The attribute is built by getConstantAttr, so the same helper decides
		// how shaped and scalar types are represented.
		Value getConstantValue(Location loc, Type type, int64_t value,
		                       PatternRewriter &rewriter) {
		  auto constantAttr = getConstantAttr(type, value, rewriter);
		  return rewriter.create<arith::ConstantOp>(loc, constantAttr);
		}

// This converts the TOSA ApplyScale operator to a set of arithmetic ops,		// This converts the TOSA ApplyScale operator to a set of arithmetic ops,
// using 64-bit operations to perform the necessary multiply, bias, and shift.		// using 64-bit operations to perform the necessary multiply, bias, and shift.
// Multiple types are used to use minimal bit width operations.		class ApplyScaleGenericOpConverter
class ApplyScaleOpConverter : public OpRewritePattern<tosa::ApplyScaleOp> {		: public OpRewritePattern<tosa::ApplyScaleOp> {
		mravishankarUnsubmitted Done Reply Inline Actions Why is this `ApplyScale48OpConverter`? mravishankar: Why is this `ApplyScale48OpConverter`?
		rsudermanAuthorUnsubmitted Done Reply Inline Actions 48 meant that it was meant for 48-bit conversion. Because we want to use this for 32-bit operations in some cases I changed it to `Generic.` rsuderman: 48 meant that it was meant for 48-bit conversion. Because we want to use this for 32-bit…
public:		public:
using OpRewritePattern<tosa::ApplyScaleOp>::OpRewritePattern;		ApplyScaleGenericOpConverter(MLIRContext *context, bool convertLowBit)
		: OpRewritePattern<tosa::ApplyScaleOp>(context) {
		this->convertLowBit = convertLowBit;
		}

LogicalResult matchAndRewrite(tosa::ApplyScaleOp op,		LogicalResult matchAndRewrite(tosa::ApplyScaleOp op,
PatternRewriter &rewriter) const final {		PatternRewriter &rewriter) const final {
Location loc = op.getLoc();		Location loc = op.getLoc();
Value value32 = op.value();		Value value = op.value();
Value multiplier32 = op.multiplier();		Value multiplier32 = op.multiplier();
Value shift8 = op.shift();

bool doubleRound = op.double_round();
Type inType = op.value().getType();
Type resultTy = op.getType();		Type resultTy = op.getType();
		Type valueTy = value.getType();
Type i8Ty = matchContainerType(rewriter.getIntegerType(8), resultTy);
Type i32Ty = matchContainerType(rewriter.getI32Type(), resultTy);		Type i32Ty = matchContainerType(rewriter.getI32Type(), resultTy);
Type i64Ty = matchContainerType(rewriter.getI64Type(), resultTy);		Type i64Ty = matchContainerType(rewriter.getI64Type(), resultTy);

Value one8 = rewriter.create<arith::ConstantOp>(		Value zero = getConstantValue(loc, valueTy, 0, rewriter);
loc, getConstantAttr(i8Ty, 1, rewriter));		Value one64 = getConstantValue(loc, i64Ty, 1, rewriter);
Value one64 = rewriter.create<arith::ConstantOp>(		Value thirtyOne32 = getConstantValue(loc, i32Ty, 31, rewriter);
loc, getConstantAttr(i64Ty, 1, rewriter));
		// Check whether this lowering should be used for low-bit conversions. If not
Value shiftSubOne8 = rewriter.create<arith::SubIOp>(loc, shift8, one8);		// the 32-bit version should be included.
		if (getElementTypeOrSelf(valueTy).getIntOrFloatBitWidth() <= 32 &&
// The rounding value semantics below equate to the following code:		!convertLowBit)
// int64_t round = 1 << (shift - 1);		return failure();
// if (double_round) {
// if (shift > 31 && value >= 0) round += 1<<30;
// if (shift > 31 && value < 0) round -= 1<<30;
// }
//
// Note that minimal bitwidth operators are used throughout the block.

Value round64 = rewriter.create<arith::ShLIOp>(		Value shift32 = rewriter.create<arith::ExtUIOp>(loc, i32Ty, op.shift());
loc, one64, rewriter.create<arith::ExtSIOp>(loc, i64Ty, shiftSubOne8));

// Double rounding is performing a round operation before the shift		// Compute the multiplication in 64-bits then select the high / low parts.
if (doubleRound) {		Value value64 = rewriter.create<arith::ExtSIOp>(loc, i64Ty, value);
Value one32 = rewriter.create<arith::ConstantOp>(		Value multiplier64 =
loc, getConstantAttr(i32Ty, 1, rewriter));		rewriter.create<arith::ExtSIOp>(loc, i64Ty, multiplier32);
Value shift32 = rewriter.create<arith::ExtSIOp>(loc, i32Ty, shift8);		Value multiply64 =
Value thirty32 = rewriter.create<arith::ConstantOp>(		rewriter.create<arith::MulIOp>(loc, value64, multiplier64);
loc, getConstantAttr(i32Ty, 30, rewriter));

Value shiftThirty32 =
rewriter.create<arith::ShLIOp>(loc, one32, thirty32);
Value shiftThirty64 =
rewriter.create<arith::ExtSIOp>(loc, i64Ty, shiftThirty32);

// Round value needs to either be added or subtracted depending on the sign
// of the input value.
Value roundAdd64 =
rewriter.create<arith::AddIOp>(loc, round64, shiftThirty64);
Value roundSub64 =
rewriter.create<arith::SubIOp>(loc, round64, shiftThirty64);

Value zero32 =
rewriter.create<arith::ConstantOp>(loc, rewriter.getZeroAttr(inType));
Value valueGreaterThanZero = rewriter.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::sge, value32, zero32);

Value doubleRound64 = rewriter.create<arith::SelectOp>(		// Apply normal rounding.
loc, valueGreaterThanZero, roundAdd64, roundSub64);		Value shift64 = rewriter.create<arith::ExtUIOp>(loc, i64Ty, shift32);
		Value round = rewriter.create<arith::ShLIOp>(loc, one64, shift64);
		round = rewriter.create<arith::ShRUIOp>(loc, round, one64);
		multiply64 = rewriter.create<arith::AddIOp>(loc, multiply64, round);

		// Apply double rounding if necessary.
		if (op.double_round()) {
		int64_t roundInt = 1 << 30;
		Value roundUp = getConstantValue(loc, i64Ty, roundInt, rewriter);
		Value roundDown = getConstantValue(loc, i64Ty, -roundInt, rewriter);
		Value positive = rewriter.create<arith::CmpIOp>(
		loc, arith::CmpIPredicate::sge, value, zero);
		Value dir =
		rewriter.create<arith::SelectOp>(loc, positive, roundUp, roundDown);
		Value val = rewriter.create<arith::AddIOp>(loc, dir, multiply64);
		Value valid = rewriter.create<arith::CmpIOp>(
		loc, arith::CmpIPredicate::sgt, shift32, thirtyOne32);
		multiply64 =
		rewriter.create<arith::SelectOp>(loc, valid, val, multiply64);
		}

// We only perform double rounding if the shift value is greater than 32.		Value result64 = rewriter.create<arith::ShRSIOp>(loc, multiply64, shift64);
Value thirtyTwo32 = rewriter.create<arith::ConstantOp>(		Value result32 = rewriter.create<arith::TruncIOp>(loc, i32Ty, result64);
loc, getConstantAttr(i32Ty, 32, rewriter));
Value shiftGreaterThanThirtyTwo = rewriter.create<arith::CmpIOp>(		rewriter.replaceOp(op, result32);
loc, arith::CmpIPredicate::sge, shift32, thirtyTwo32);		return success();
round64 = rewriter.create<arith::SelectOp>(loc, shiftGreaterThanThirtyTwo,
doubleRound64, round64);
}		}

// The computation below equates to the following pseudocode:		private:
// int64_t result = (int64_t)value * multiplier + round;		bool convertLowBit;
// result = result >> shift;		};
//
// Note that multiply and shift need to be perform in i64 to preserve bits.		class ApplyScale32OpConverter : public OpRewritePattern<tosa::ApplyScaleOp> {
		public:
		using OpRewritePattern<tosa::ApplyScaleOp>::OpRewritePattern;

		LogicalResult matchAndRewrite(tosa::ApplyScaleOp op,
		PatternRewriter &rewriter) const final {
		Location loc = op.getLoc();

		Type resultTy = op.getType();
		Type i32Ty = matchContainerType(rewriter.getI32Type(), resultTy);
		Type i64Ty = matchContainerType(rewriter.getI64Type(), resultTy);

		Value value = op.value();
		if (getElementTypeOrSelf(value.getType()).getIntOrFloatBitWidth() > 32) {
		return failure();
		}

		Value value32 = op.value();
		Value multiplier32 = op.multiplier();
		Value shift32 = rewriter.create<arith::ExtUIOp>(loc, i32Ty, op.shift());

		// Constants used during the scaling operation.
		Value zero32 = getConstantValue(loc, i32Ty, 0, rewriter);
		Value one32 = getConstantValue(loc, i32Ty, 1, rewriter);
		Value two32 = getConstantValue(loc, i32Ty, 2, rewriter);
		Value thirty32 = getConstantValue(loc, i32Ty, 30, rewriter);
		Value thirtyTwo32 = getConstantValue(loc, i32Ty, 32, rewriter);
		Value thirtyTwo64 = getConstantValue(loc, i64Ty, 32, rewriter);

		// Compute the multiplication in 64-bits then select the high / low parts.
Value value64 = rewriter.create<arith::ExtSIOp>(loc, i64Ty, value32);		Value value64 = rewriter.create<arith::ExtSIOp>(loc, i64Ty, value32);
Value multiplier64 =		Value multiplier64 =
rewriter.create<arith::ExtSIOp>(loc, i64Ty, multiplier32);		rewriter.create<arith::ExtSIOp>(loc, i64Ty, multiplier32);
Value shift64 = rewriter.create<arith::ExtSIOp>(loc, i64Ty, shift8);		Value multiply64 =
		rewriter.create<arith::MulIOp>(loc, value64, multiplier64);

// Multiply as a pair of i64 values to guarantee the end value fits.		// Grab out the high/low of the computation
Value result64 = rewriter.create<arith::MulIOp>(loc, value64, multiplier64);		Value high64 =
result64 = rewriter.create<arith::AddIOp>(loc, result64, round64);		rewriter.create<arith::ShRUIOp>(loc, multiply64, thirtyTwo64);
result64 = rewriter.create<arith::ShRSIOp>(loc, result64, shift64);		Value high32 = rewriter.create<arith::TruncIOp>(loc, i32Ty, high64);
		Value low32 = rewriter.create<arith::MulIOp>(loc, value32, multiplier32);

Value result32 = rewriter.create<arith::TruncIOp>(loc, resultTy, result64);		// Determine the direction and amount to shift the high bits.
		Value shiftOver32 = rewriter.create<arith::CmpIOp>(
		loc, arith::CmpIPredicate::sge, shift32, thirtyTwo32);
		Value roundHighBits = rewriter.create<arith::CmpIOp>(
		loc, arith::CmpIPredicate::sgt, shift32, thirtyTwo32);

rewriter.replaceOp(op, result32);		Value shiftHighL =
		rewriter.create<arith::SubIOp>(loc, thirtyTwo32, shift32);
		Value shiftHighR =
		rewriter.create<arith::SubIOp>(loc, shift32, thirtyTwo32);

		shiftHighL =
		rewriter.create<arith::SelectOp>(loc, shiftOver32, zero32, shiftHighL);
		shiftHighR =
		rewriter.create<arith::SelectOp>(loc, shiftOver32, shiftHighR, zero32);

		// Conditionally perform our double round.
		if (op.double_round()) {
		Value negOne32 = getConstantValue(loc, i32Ty, -1, rewriter);
		Value valuePositive = rewriter.create<arith::CmpIOp>(
		loc, arith::CmpIPredicate::sge, value32, zero32);

		Value roundDir =
		rewriter.create<arith::SelectOp>(loc, valuePositive, one32, negOne32);
		roundDir =
		rewriter.create<arith::SelectOp>(loc, shiftOver32, roundDir, zero32);

		Value shiftLow = rewriter.create<arith::ShRUIOp>(loc, low32, thirty32);
		Value rounded = rewriter.create<arith::AddIOp>(loc, shiftLow, roundDir);
		Value carry = rewriter.create<arith::ShRSIOp>(loc, rounded, two32);

		Value shiftRound =
		rewriter.create<arith::ShLIOp>(loc, roundDir, thirty32);

		low32 = rewriter.create<arith::AddIOp>(loc, low32, shiftRound);
		high32 = rewriter.create<arith::AddIOp>(loc, high32, carry);
		}

		// Conditionally apply rounding in the low bits.
		{
		Value shiftSubOne = rewriter.create<arith::SubIOp>(loc, shift32, one32);
		Value roundBit = rewriter.create<arith::ShLIOp>(loc, one32, shiftSubOne);
		roundBit = rewriter.create<arith::SelectOp>(loc, roundHighBits, zero32,
		roundBit);

		Value newLow32 = rewriter.create<arith::AddIOp>(loc, low32, roundBit);
		Value wasRounded = rewriter.create<arith::CmpIOp>(
		loc, arith::CmpIPredicate::ugt, low32, newLow32);
		low32 = newLow32;

		Value rounded32 = rewriter.create<arith::ExtUIOp>(loc, i32Ty, wasRounded);
		high32 = rewriter.create<arith::AddIOp>(loc, high32, rounded32);
		}

		// Conditionally apply rounding in the high bits.
		{
		Value shiftSubOne =
		rewriter.create<arith::SubIOp>(loc, shiftHighR, one32);
		Value roundBit = rewriter.create<arith::ShLIOp>(loc, one32, shiftSubOne);
		roundBit = rewriter.create<arith::SelectOp>(loc, roundHighBits, roundBit,
		zero32);
		high32 = rewriter.create<arith::AddIOp>(loc, high32, roundBit);
		}

		// Combine the correct high/low bits into the final rescale result.
		high32 = rewriter.create<arith::ShLIOp>(loc, high32, shiftHighL);
		high32 = rewriter.create<arith::ShRSIOp>(loc, high32, shiftHighR);
		low32 = rewriter.create<arith::ShRUIOp>(loc, low32, shift32);
		low32 = rewriter.create<arith::SelectOp>(loc, shiftOver32, zero32, low32);

		// Apply the rounding behavior and shift to the final alignment.
		Value result = rewriter.create<arith::AddIOp>(loc, low32, high32);

		// Truncate if necessary.
		if (!getElementTypeOrSelf(resultTy).isInteger(32)) {
		result = rewriter.create<arith::TruncIOp>(loc, resultTy, result);
		}

		rewriter.replaceOp(op, result);
return success();		return success();
}		}
};		};

} // namespace		} // namespace

void mlir::tosa::populateTosaToArithConversionPatterns(		void mlir::tosa::populateTosaToArithConversionPatterns(
RewritePatternSet *patterns) {		RewritePatternSet *patterns) {
patterns->add<ConstOpConverter>(patterns->getContext());		patterns->add<ConstOpConverter>(patterns->getContext());
}		}

void mlir::tosa::populateTosaRescaleToArithConversionPatterns(		void mlir::tosa::populateTosaRescaleToArithConversionPatterns(
RewritePatternSet *patterns) {		RewritePatternSet *patterns, bool use32Bit) {
patterns->add<ApplyScaleOpConverter>(patterns->getContext());		patterns->add<ApplyScaleGenericOpConverter>(patterns->getContext(),
		!use32Bit);
		if (use32Bit) {
		patterns->add<ApplyScale32OpConverter>(patterns->getContext());
		}
		mravishankarUnsubmitted Done Reply Inline Actions This probably needs to be added with a higher benefit. If both this pattern and the non-32 bit patterns are added by a caller, then it is undefined which will trigger first. That can lead to strange compilation. mravishankar: This probably needs to be added with a higher benefit. If both this pattern and the non-32 bit…
		dcaballeUnsubmitted Done Reply Inline Actions +1. Nit: Alternatively, we could have a single populate function with a flag to enable the 32-bit lowering. That would keep both related patterns and their priorities in a single function and avoid potential misuses. dcaballe: +1. Nit: Alternatively, we could have a single populate function with a flag to enable the 32…
		rsudermanAuthorUnsubmitted Done Reply Inline Actions Updated to include benefit values and just a bool to decide whether to include the 32-bit lowering path. Note that on llvm integrate we are going to need an integration fix. rsuderman: Updated to include benefit values and just a bool to decide whether to include the 32-bit…
}		}

mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp

Show All 30 Lines	void runOnOperation() override {
RewritePatternSet patterns(&getContext());		RewritePatternSet patterns(&getContext());
ConversionTarget target(getContext());		ConversionTarget target(getContext());
target.addIllegalOp<tosa::ConstOp>();		target.addIllegalOp<tosa::ConstOp>();
target.addLegalDialect<arith::ArithmeticDialect>();		target.addLegalDialect<arith::ArithmeticDialect>();

mlir::tosa::populateTosaToArithConversionPatterns(&patterns);		mlir::tosa::populateTosaToArithConversionPatterns(&patterns);

if (this->includeApplyRescale) {		if (this->includeApplyRescale) {
mlir::tosa::populateTosaRescaleToArithConversionPatterns(&patterns);		mlir::tosa::populateTosaRescaleToArithConversionPatterns(&patterns,
		this->use32Bit);
		mravishankarUnsubmitted Done Reply Inline Actions You could set this to default false? mravishankar: You could set this to default false?
		rsudermanAuthorUnsubmitted Done Reply Inline Actions It is set to default to false in the `Passes.td` file. Added a default on the function definition as well. rsuderman: It is set to default to false in the `Passes.td` file. Added a default on the function…
target.addIllegalOp<tosa::ApplyScaleOp>();		target.addIllegalOp<tosa::ApplyScaleOp>();
}		}

if (failed(applyPartialConversion(getOperation(), target,		if (failed(applyPartialConversion(getOperation(), target,
std::move(patterns))))		std::move(patterns))))
signalPassFailure();		signalPassFailure();
}		}
};		};
} // namespace		} // namespace

std::unique_ptr<Pass> mlir::tosa::createTosaToArith() {		std::unique_ptr<Pass> mlir::tosa::createTosaToArith() {
return std::make_unique<TosaToArith>();		return std::make_unique<TosaToArith>();
}		}

mlir/test/Conversion/TosaToArith/tosa-to-arith.mlir

	// RUN: mlir-opt --split-input-file --tosa-to-arith="include-apply-rescale=true" %s -verify-diagnostics -o -\| FileCheck %s			// RUN: mlir-opt --split-input-file --tosa-to-arith="include-apply-rescale=true use-32-bit=true" %s -verify-diagnostics -o -\| FileCheck %s
	// RUN: mlir-opt --split-input-file --tosa-to-arith="include-apply-rescale=false" %s -verify-diagnostics -o -\| FileCheck --check-prefix="SCALE" %s			// RUN: mlir-opt --split-input-file --tosa-to-arith="include-apply-rescale=false" %s -verify-diagnostics -o -\| FileCheck --check-prefix="SCALE" %s

	// CHECK-LABEL: func @const_test			// CHECK-LABEL: func @const_test
	func.func @const_test() -> (tensor<i32>) {			func.func @const_test() -> (tensor<i32>) {
	// CHECK: [[C3:%.+]] = arith.constant dense<3> : tensor<i32>			// CHECK: [[C3:%.+]] = arith.constant dense<3> : tensor<i32>
	%0 = "tosa.const"() {value = dense<3> : tensor<i32>} : () -> tensor<i32>			%result = "tosa.const"() {value = dense<3> : tensor<i32>} : () -> tensor<i32>

	// CHECK: return [[C3]]			// CHECK: return [[C3]]
	return %0 : tensor<i32>			return %result : tensor<i32>
	}			}

	// -----			// -----

	// CHECK-LABEL: @apply_scale_test_i32			// CHECK-LABEL: @apply_scale_test_i32
	func.func @apply_scale_test_i32(%arg0 : i32, %arg1 : i32, %arg2 : i8) -> (i32) {
	// CHECK-DAG: [[C1_8:%.+]] = arith.constant 1 : i8
	// CHECK-DAG: [[C1_32:%.+]] = arith.constant 1 : i32
	// CHECK-DAG: [[C1_64:%.+]] = arith.constant 1 : i64
	// CHECK-DAG: [[SHIFT_MINUS_ONE_8:%.+]] = arith.subi %arg2, [[C1_8]]

	// CHECK-DAG: [[SHIFT_32:%.+]] = arith.extsi %arg2 : i8 to i32
	// CHECK-DAG: [[SHIFT_MINUS_ONE_64:%.+]] = arith.extsi [[SHIFT_MINUS_ONE_8]] : i8 to i64
	// CHECK-DAG: [[SHIFTED_64:%.+]] = arith.shli [[C1_64]], [[SHIFT_MINUS_ONE_64]]

	// CHECK-DAG: [[C0_32:%.+]] = arith.constant 0 : i32
	// CHECK-DAG: [[C30_32:%.+]] = arith.constant 30 : i32
	// CHECK-DAG: [[SECOND_BIAS:%.+]] = arith.shli [[C1_32]], [[C30_32]]
	// CHECK-DAG: [[SECOND_BIAS_64:%.+]] = arith.extsi [[SECOND_BIAS]] : i32 to i64
	// CHECK-DAG: [[POSITIVE_ROUND:%.+]] = arith.addi [[SHIFTED_64]], [[SECOND_BIAS_64]]
	// CHECK-DAG: [[NEGATIVE_ROUND:%.+]] = arith.subi [[SHIFTED_64]], [[SECOND_BIAS_64]]
	// CHECK-DAG: [[VALUE_NEGATIVE:%.+]] = arith.cmpi sge, %arg0, [[C0_32]] : i32
	// CHECK-DAG: [[DOUBLE_ROUNDED:%.+]] = arith.select [[VALUE_NEGATIVE]], [[POSITIVE_ROUND]], [[NEGATIVE_ROUND]] : i64
	// CHECK-DAG: [[C32_32:%.+]] = arith.constant 32 : i32
	// CHECK-DAG: [[IS_32BIT_SHIFT:%.+]] = arith.cmpi sge, [[SHIFT_32]], [[C32_32]]
	// CHECK-DAG: [[ROUND:%.+]] = arith.select [[IS_32BIT_SHIFT]], [[DOUBLE_ROUNDED]], [[SHIFTED_64]]

	// CHECK-DAG: [[VAL_64:%.+]] = arith.extsi %arg0 : i32 to i64
	// CHECK-DAG: [[MULTIPLY_64:%.+]] = arith.extsi %arg1 : i32 to i64
	// CHECK-DAG: [[SHIFT_64:%.+]] = arith.extsi %arg2 : i8 to i64
	// CHECK-DAG: [[SCALED:%.+]] = arith.muli [[VAL_64]], [[MULTIPLY_64]]
	// CHECK-DAG: [[BIASED:%.+]] = arith.addi [[SCALED]], [[ROUND]]
	// CHECK-DAG: [[DOWNSHIFTED:%.+]] = arith.shrsi [[BIASED]], [[SHIFT_64]]
	// CHECK: [[TRUNCATED:%.+]] = arith.trunci [[DOWNSHIFTED]]

	// SCALE: "tosa.apply_scale"			// SCALE: "tosa.apply_scale"
	%0 = "tosa.apply_scale"(%arg0, %arg1, %arg2) {double_round = true} : (i32, i32, i8) -> i32			func.func @apply_scale_test_i32(%arg0 : i32, %arg1 : i32, %arg2 : i8) -> (i32) {
	return %0 : i32			// CHECK-DAG: %[[S32:.+]] = arith.extui %arg2 : i8 to i32
				// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : i32
				// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : i32
				// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : i32
				// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : i32
				// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : i32
				// CHECK-DAG: %[[C32L:.+]] = arith.constant 32 : i64

				// Compute the high-low values of the matmul in 64-bits.
				// CHECK-DAG: %[[V64:.+]] = arith.extsi %arg0 : i32 to i64
				// CHECK-DAG: %[[M64:.+]] = arith.extsi %arg1 : i32 to i64
				// CHECK-DAG: %[[MUL64:.+]] = arith.muli %[[V64]], %[[M64]]
				// CHECK-DAG: %[[HI64:.+]] = arith.shrui %[[MUL64]], %[[C32L]]
				// CHECK-DAG: %[[HI:.+]] = arith.trunci %[[HI64]] : i64 to i32
				// CHECK-DAG: %[[LOW:.+]] = arith.muli %arg0, %arg1

				// Determine whether the high bits need to shift left or right and by how much.
				// CHECK-DAG: %[[OVER31:.+]] = arith.cmpi sge, %[[S32]], %[[C32]]
				// CHECK-DAG: %[[OVER32:.+]] = arith.cmpi sgt, %[[S32]], %[[C32]]
				// CHECK-DAG: %[[HISHLN:.+]] = arith.subi %[[C32]], %[[S32]]
				// CHECK-DAG: %[[HISHRN:.+]] = arith.subi %[[S32]], %[[C32]]
				// CHECK-DAG: %[[HISHL:.+]] = arith.select %[[OVER31]], %[[C0]], %[[HISHLN]]
				// CHECK-DAG: %[[HISHR:.+]] = arith.select %[[OVER31]], %[[HISHRN]], %[[C0]]

				// Apply double rounding.
				// CHECK-DAG: %[[CN1:.+]] = arith.constant -1
				// CHECK-DAG: %[[POS:.+]] = arith.cmpi sge, %arg0, %[[C0]]
				// CHECK-DAG: %[[DIR:.+]] = arith.select %[[POS]], %[[C1]], %[[CN1]]
				// CHECK-DAG: %[[DRND:.+]] = arith.select %[[OVER31]], %[[DIR]], %[[C0]]
				// CHECK-DAG: %[[DSHFTR:.+]] = arith.shrui %[[LOW]], %[[C30]]
				// CHECK-DAG: %[[DRNDED:.+]] = arith.addi %[[DSHFTR]], %[[DRND]]
				// CHECK-DAG: %[[DCARRY:.+]] = arith.shrsi %[[DRNDED]], %[[C2]]
				// CHECK-DAG: %[[DBIT:.+]] = arith.shli %[[DRND]], %[[C30]]
				// CHECK-DAG: %[[DLOW:.+]] = arith.addi %[[LOW]], %[[DBIT]]
				// CHECK-DAG: %[[DHI:.+]] = arith.addi %[[HI]], %[[DCARRY]]

				// Apply low-bit rounding.
				// CHECK-DAG: %[[SHFTM1:.+]] = arith.subi %[[S32]], %[[C1]]
				// CHECK-DAG: %[[LBIT:.+]] = arith.shli %[[C1]], %[[SHFTM1]]
				// CHECK-DAG: %[[HALF:.+]] = arith.select %[[OVER32]], %[[C0]], %[[LBIT]]
				// CHECK-DAG: %[[LADD:.+]] = arith.addi %[[DLOW]], %[[HALF]]
				// CHECK-DAG: %[[LLO:.+]] = arith.cmpi ugt, %[[DLOW]], %[[LADD]]
				// CHECK-DAG: %[[LCARRY:.+]] = arith.extui %[[LLO]] : i1 to i32
				// CHECK-DAG: %[[LRNDED:.+]] = arith.addi %[[DHI]], %[[LCARRY]]

				// Apply high-bit rounding.
				// CHECK-DAG: %[[HISHRM1:.+]] = arith.subi %[[HISHR]], %[[C1]]
				// CHECK-DAG: %[[LHISHFT:.+]] = arith.shli %[[C1]], %[[HISHRM1]]
				// CHECK-DAG: %[[LHI:.+]] = arith.select %[[OVER32]], %[[LHISHFT]], %[[C0]]
				// CHECK-DAG: %[[FHI:.+]] = arith.addi %[[LRNDED]], %[[LHI]]

				// Combine hi-low into the final result.
				// CHECK-DAG: %[[HIL:.+]] = arith.shli %[[FHI]], %[[HISHL]]
				// CHECK-DAG: %[[HIALIGN:.+]] = arith.shrsi %[[HIL]], %[[HISHR]]
				// CHECK-DAG: %[[LOR:.+]] = arith.shrui %[[LADD]], %[[S32]]
				// CHECK-DAG: %[[LOWALIGN:.+]] = arith.select %[[OVER31]], %[[C0]], %[[LOR]]
				// CHECK-DAG: %[[RESULT:.+]] = arith.addi %[[LOWALIGN]], %[[HIALIGN]]
				// CHECK: return %[[RESULT]]
				%res = "tosa.apply_scale"(%arg0, %arg1, %arg2) {double_round = true} : (i32, i32, i8) -> i32
				return %res : i32
	}			}

	// -----			// -----

	// CHECK-LABEL: @apply_scale_test_vector			// CHECK-LABEL: @apply_scale_test_vector
				// SCALE: "tosa.apply_scale"
	func.func @apply_scale_test_vector(%arg0 : vector<4xi32>, %arg1 : vector<4xi32>, %arg2 : vector<4xi8>) -> (vector<4xi32>) {			func.func @apply_scale_test_vector(%arg0 : vector<4xi32>, %arg1 : vector<4xi32>, %arg2 : vector<4xi8>) -> (vector<4xi32>) {
	// CHECK-DAG: [[C1_8:%.+]] = arith.constant dense<1> : vector<4xi8>			// CHECK-NOT: "tosa.apply_scale"
	// CHECK-DAG: [[C1_32:%.+]] = arith.constant dense<1> : vector<4xi32>			%res = "tosa.apply_scale"(%arg0, %arg1, %arg2) {double_round = true} : (vector<4xi32>, vector<4xi32>, vector<4xi8>) -> vector<4xi32>
	// CHECK-DAG: [[C1_64:%.+]] = arith.constant dense<1> : vector<4xi64>			return %res : vector<4xi32>
	// CHECK-DAG: [[SHIFT_MINUS_ONE_8:%.+]] = arith.subi %arg2, [[C1_8]]

	// CHECK-DAG: [[SHIFT_32:%.+]] = arith.extsi %arg2 : vector<4xi8> to vector<4xi32>
	// CHECK-DAG: [[SHIFT_MINUS_ONE_64:%.+]] = arith.extsi [[SHIFT_MINUS_ONE_8]] : vector<4xi8> to vector<4xi64>
	// CHECK-DAG: [[SHIFTED_64:%.+]] = arith.shli [[C1_64]], [[SHIFT_MINUS_ONE_64]]

	// CHECK-DAG: [[C0_32:%.+]] = arith.constant dense<0> : vector<4xi32>
	// CHECK-DAG: [[C30_32:%.+]] = arith.constant dense<30> : vector<4xi32>
	// CHECK-DAG: [[SECOND_BIAS:%.+]] = arith.shli [[C1_32]], [[C30_32]]
	// CHECK-DAG: [[SECOND_BIAS_64:%.+]] = arith.extsi [[SECOND_BIAS]] : vector<4xi32> to vector<4xi64>
	// CHECK-DAG: [[POSITIVE_ROUND:%.+]] = arith.addi [[SHIFTED_64]], [[SECOND_BIAS_64]]
	// CHECK-DAG: [[NEGATIVE_ROUND:%.+]] = arith.subi [[SHIFTED_64]], [[SECOND_BIAS_64]]
	// CHECK-DAG: [[VALUE_NEGATIVE:%.+]] = arith.cmpi sge, %arg0, [[C0_32]] : vector<4xi32>
	// CHECK-DAG: [[DOUBLE_ROUNDED:%.+]] = arith.select [[VALUE_NEGATIVE]], [[POSITIVE_ROUND]], [[NEGATIVE_ROUND]] : vector<4xi1>, vector<4xi64>
	// CHECK-DAG: [[C32_32:%.+]] = arith.constant dense<32> : vector<4xi32>
	// CHECK-DAG: [[IS_32BIT_SHIFT:%.+]] = arith.cmpi sge, [[SHIFT_32]], [[C32_32]]
	// CHECK-DAG: [[ROUND:%.+]] = arith.select [[IS_32BIT_SHIFT]], [[DOUBLE_ROUNDED]], [[SHIFTED_64]]

	// CHECK-DAG: [[VAL_64:%.+]] = arith.extsi %arg0 : vector<4xi32> to vector<4xi64>
	// CHECK-DAG: [[MULTIPLY_64:%.+]] = arith.extsi %arg1 : vector<4xi32> to vector<4xi64>
	// CHECK-DAG: [[SHIFT_64:%.+]] = arith.extsi %arg2 : vector<4xi8> to vector<4xi64>
	// CHECK-DAG: [[SCALED:%.+]] = arith.muli [[VAL_64]], [[MULTIPLY_64]]
	// CHECK-DAG: [[BIASED:%.+]] = arith.addi [[SCALED]], [[ROUND]]
	// CHECK-DAG: [[DOWNSHIFTED:%.+]] = arith.shrsi [[BIASED]], [[SHIFT_64]]
	// CHECK: [[TRUNCATED:%.+]] = arith.trunci [[DOWNSHIFTED]]

	%0 = "tosa.apply_scale"(%arg0, %arg1, %arg2) {double_round = true} : (vector<4xi32>, vector<4xi32>, vector<4xi8>) -> vector<4xi32>
	return %0 : vector<4xi32>
	}			}

	// -----			// -----

	// CHECK-LABEL: @apply_scale_test_i48			// CHECK-LABEL: @apply_scale_test_i48
				// SCALE: "tosa.apply_scale"
	func.func @apply_scale_test_i48(%arg0 : i48, %arg1 : i32, %arg2 : i8) -> (i32) {			func.func @apply_scale_test_i48(%arg0 : i48, %arg1 : i32, %arg2 : i8) -> (i32) {
	// CHECK-DAG: [[C1_8:%.+]] = arith.constant 1 : i8			// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : i48
	// CHECK-DAG: [[C1_32:%.+]] = arith.constant 1 : i32			// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : i64
	// CHECK-DAG: [[C1_64:%.+]] = arith.constant 1 : i64			// CHECK-DAG: %[[C31:.+]] = arith.constant 31 : i32
	// CHECK-DAG: [[C30_32:%.+]] = arith.constant 30 : i32
	// CHECK-DAG: [[C0_32:%.+]] = arith.constant 0 : i48			// Multiply in 64 bits.
	// CHECK-DAG: [[C32_32:%.+]] = arith.constant 32 : i32			// CHECK-DAG: %[[V64:.+]] = arith.extsi %arg0 : i48 to i64
	// CHECK-DAG: [[SHIFT_MINUS_ONE_8:%.+]] = arith.subi %arg2, [[C1_8]]			// CHECK-DAG: %[[M64:.+]] = arith.extsi %arg1 : i32 to i64
	// CHECK-DAG: [[SHIFT_32:%.+]] = arith.extsi %arg2 : i8 to i32			// CHECK-DAG: %[[MUL:.+]] = arith.muli %[[V64]], %[[M64]]
	// CHECK-DAG: [[SHIFT_MINUS_ONE_64:%.+]] = arith.extsi [[SHIFT_MINUS_ONE_8]] : i8 to i64
	// CHECK-DAG: [[SHIFTED_64:%.+]] = arith.shli [[C1_64]], [[SHIFT_MINUS_ONE_64]]			// Round normally.
	// CHECK-DAG: [[SECOND_BIAS:%.+]] = arith.shli [[C1_32]], [[C30_32]]			// CHECK-DAG: %[[S32:.+]] = arith.extui %arg2 : i8 to i32
	// CHECK-DAG: [[SECOND_BIAS_64:%.+]] = arith.extsi [[SECOND_BIAS]] : i32 to i64			// CHECK-DAG: %[[S64:.+]] = arith.extui %[[S32]] : i32 to i64
	// CHECK-DAG: [[POSITIVE_ROUND:%.+]] = arith.addi [[SHIFTED_64]], [[SECOND_BIAS_64]]			// CHECK-DAG: %[[ONEL:.+]] = arith.shli %[[C1]], %[[S64]] : i64
	// CHECK-DAG: [[NEGATIVE_ROUND:%.+]] = arith.subi [[SHIFTED_64]], [[SECOND_BIAS_64]]			// CHECK-DAG: %[[ONER:.+]] = arith.shrui %[[ONEL]], %[[C1]]
	// CHECK-DAG: [[VALUE_NEGATIVE:%.+]] = arith.cmpi sge, %arg0, [[C0_32]] : i48			// CHECK-DAG: %[[ROUND:.+]] = arith.addi %[[MUL]], %[[ONER]]
	// CHECK-DAG: [[DOUBLE_ROUNDED:%.+]] = arith.select [[VALUE_NEGATIVE]], [[POSITIVE_ROUND]], [[NEGATIVE_ROUND]] : i64
	// CHECK-DAG: [[IS_32BIT_SHIFT:%.+]] = arith.cmpi sge, [[SHIFT_32]], [[C32_32]]			// Apply double rounding.
	// CHECK-DAG: [[ROUND:%.+]] = arith.select [[IS_32BIT_SHIFT]], [[DOUBLE_ROUNDED]], [[SHIFTED_64]]			// CHECK-DAG: %[[DUP:.+]] = arith.constant 1073741824 : i64
	// CHECK-DAG: [[VAL_64:%.+]] = arith.extsi %arg0 : i48 to i64			// CHECK-DAG: %[[DDOWN:.+]] = arith.constant -1073741824 : i64
	// CHECK-DAG: [[MULTIPLY_64:%.+]] = arith.extsi %arg1 : i32 to i64			// CHECK-DAG: %[[POS:.+]] = arith.cmpi sge, %arg0, %[[C0]]
	// CHECK-DAG: [[SHIFT_64:%.+]] = arith.extsi %arg2 : i8 to i64			// CHECK-DAG: %[[DBIT:.+]] = arith.select %[[POS]], %[[DUP]], %[[DDOWN]]
	// CHECK-DAG: [[SCALED:%.+]] = arith.muli [[VAL_64]], [[MULTIPLY_64]]			// CHECK-DAG: %[[DRND:.+]] = arith.addi %[[DBIT]], %[[ROUND]]
	// CHECK-DAG: [[BIASED:%.+]] = arith.addi [[SCALED]], [[ROUND]]			// CHECK-DAG: %[[USED:.+]] = arith.cmpi sgt, %[[S32]], %[[C31]] : i32
	// CHECK-DAG: [[DOWNSHIFTED:%.+]] = arith.shrsi [[BIASED]], [[SHIFT_64]]			// CHECK-DAG: %[[RES64:.+]] = arith.select %[[USED]], %[[DRND]], %[[ROUND]] : i64
	// CHECK: [[TRUNCATED:%.+]] = arith.trunci [[DOWNSHIFTED]]
	%0 = "tosa.apply_scale"(%arg0, %arg1, %arg2) {double_round = true} : (i48, i32, i8) -> i32			// Shift and truncate final answer.
	return %0 : i32			// CHECK-DAG: %[[SHR:.+]] = arith.shrsi %[[RES64]], %[[S64]]
				// CHECK-DAG: %[[TRUNC:.+]] = arith.trunci %[[SHR]] : i64 to i32
				// CHECK: return %[[TRUNC]]
				%res = "tosa.apply_scale"(%arg0, %arg1, %arg2) {double_round = true} : (i48, i32, i8) -> i32
				return %res : i32
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][tosa] Rework tosa.apply_scale lowering for 32-bit
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 429841

mlir/include/mlir/Conversion/Passes.td

mlir/include/mlir/Conversion/TosaToArith/TosaToArith.h

mlir/lib/Conversion/TosaToArith/TosaToArith.cpp

mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp

mlir/test/Conversion/TosaToArith/tosa-to-arith.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][tosa] Rework tosa.apply_scale lowering for 32-bitClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 429841

mlir/include/mlir/Conversion/Passes.td

mlir/include/mlir/Conversion/TosaToArith/TosaToArith.h

mlir/lib/Conversion/TosaToArith/TosaToArith.cpp

mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp

mlir/test/Conversion/TosaToArith/tosa-to-arith.mlir

[mlir][tosa] Rework tosa.apply_scale lowering for 32-bit
ClosedPublic