Diff 460790

mlir/lib/Dialect/Arithmetic/Transforms/EmulateWideInt.cpp

Show First 20 Lines • Show All 47 Lines • ▼ Show 20 Lines static Type reduceInnermostDim(VectorType type) {

if (type.getShape().size() == 1) if (type.getShape().size() == 1)

return type.getElementType(); return type.getElementType();

auto newShape = to_vector(type.getShape()); auto newShape = to_vector(type.getShape());

newShape.back() = 1; newShape.back() = 1;

return VectorType::get(newShape, type.getElementType()); return VectorType::get(newShape, type.getElementType());

} }

// Returns a constant of integer of vector type filled with (repeated) `value`.

MogballUnsubmitted

Not Done

return VectorType::get(newShape, type.getElementType());

}

- // Returns a constant of integer of vector type filled with (repeated) `value`.

+ /// Returns a constant of integer of vector type filled with (repeated) `value`.

static Value createScalarOrSplatConstant(ConversionPatternRewriter &rewriter,

Mogball:

/// Builds an `arith.constant` of the requested `type` holding `value`.
/// When `type` is an `IntegerType` the constant is a scalar integer; any other
/// `type` is cast to `VectorType` and every element of the resulting splat
/// constant is set to `value`.
static Value createScalarOrSplatConstant(ConversionPatternRewriter &rewriter,
                                         Location loc, Type type,
                                         const APInt &value) {
  // Pick the attribute flavor from the kind of type that was requested:
  // a plain integer attribute for scalars, a splat for vectors.
  Attribute constantAttr =
      type.isa<IntegerType>()
          ? Attribute(rewriter.getIntegerAttr(type, value))
          : Attribute(SplatElementsAttr::get(type.cast<VectorType>(), value));
  return rewriter.create<arith::ConstantOp>(loc, constantAttr);
}

// Returns a constant of integer of vector type filled with (repeated) `value`.

MogballUnsubmitted

Not Done

return rewriter.create<arith::ConstantOp>(loc, attr);

}

- // Returns a constant of integer of vector type filled with (repeated) `value`.

+ /// Returns a constant of integer of vector type filled with (repeated) `value`.

static Value createScalarOrSplatConstant(ConversionPatternRewriter &rewriter,

Mogball:

/// Convenience overload that accepts a plain 64-bit integer. The bit width of
/// the constant is taken from `type` itself (scalar integer) or from its
/// element type (vector), and the work is forwarded to the `APInt` overload.
static Value createScalarOrSplatConstant(ConversionPatternRewriter &rewriter,
                                         Location loc, Type type,
                                         int64_t value) {
  // A null `scalarTy` means `type` is not an integer and is treated as a
  // vector below.
  auto scalarTy = type.dyn_cast<IntegerType>();
  const unsigned bitWidth =
      scalarTy ? scalarTy.getWidth()
               : type.cast<VectorType>().getElementTypeBitWidth();
  // NOTE(review): the APInt is built without an explicit signedness argument;
  // presumably negative `value`s are not sign-extended past 64 bits -- confirm
  // that callers only pass non-negative values for wide element types.
  return createScalarOrSplatConstant(rewriter, loc, type,
                                     APInt(bitWidth, value));
}

// Extracts the `input` vector slice with elements at the last dimension offset // Extracts the `input` vector slice with elements at the last dimension offset

// by `lastOffset`. Returns a value of vector type with the last dimension // by `lastOffset`. Returns a value of vector type with the last dimension

// reduced to x1 or fully scalarized, e.g.: // reduced to x1 or fully scalarized, e.g.:

// - vector<3x2xi16> --> vector<3x1xi16> // - vector<3x2xi16> --> vector<3x1xi16>

// - vector<2xi16> --> i16 // - vector<2xi16> --> i16

static Value extractLastDimSlice(ConversionPatternRewriter &rewriter, static Value extractLastDimSlice(ConversionPatternRewriter &rewriter,

Location loc, Value input, Location loc, Value input,

int64_t lastOffset) { int64_t lastOffset) {

▲ Show 20 Lines • Show All 85 Lines • ▼ Show 20 Lines static Value constructResultVector(ConversionPatternRewriter &rewriter,

Location loc, VectorType resultType, Location loc, VectorType resultType,

ValueRange resultComponents) { ValueRange resultComponents) {

llvm::ArrayRef<int64_t> resultShape = resultType.getShape(); llvm::ArrayRef<int64_t> resultShape = resultType.getShape();

(void)resultShape; (void)resultShape;

assert(!resultShape.empty() && "Result expected to have dimentions"); assert(!resultShape.empty() && "Result expected to have dimentions");

assert(resultShape.back() == static_cast<int64_t>(resultComponents.size()) && assert(resultShape.back() == static_cast<int64_t>(resultComponents.size()) &&

"Wrong number of result components"); "Wrong number of result components");

Value resultVec = Value resultVec = createScalarOrSplatConstant(rewriter, loc, resultType, 0);

rewriter.create<arith::ConstantOp>(loc, rewriter.getZeroAttr(resultType));

for (auto [i, component] : llvm::enumerate(resultComponents)) for (auto [i, component] : llvm::enumerate(resultComponents))

resultVec = insertLastDimSlice(rewriter, loc, component, resultVec, i); resultVec = insertLastDimSlice(rewriter, loc, component, resultVec, i);

return resultVec; return resultVec;

} }

namespace { namespace {

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines

struct ConvertAddI final : OpConversionPattern<arith::AddIOp> { struct ConvertAddI final : OpConversionPattern<arith::AddIOp> {

using OpConversionPattern::OpConversionPattern; using OpConversionPattern::OpConversionPattern;

LogicalResult LogicalResult

matchAndRewrite(arith::AddIOp op, OpAdaptor adaptor, matchAndRewrite(arith::AddIOp op, OpAdaptor adaptor,

ConversionPatternRewriter &rewriter) const override { ConversionPatternRewriter &rewriter) const override {

Location loc = op->getLoc(); Location loc = op->getLoc();

Value lhs = adaptor.getLhs();

Value rhs = adaptor.getRhs();

auto newTy = getTypeConverter() auto newTy = getTypeConverter()

->convertType(op.getType()) ->convertType(op.getType())

.dyn_cast_or_null<VectorType>(); .dyn_cast_or_null<VectorType>();

if (!newTy) if (!newTy)

return rewriter.notifyMatchFailure(loc, "expected scalar or vector type"); return rewriter.notifyMatchFailure(loc, "expected scalar or vector type");

Type newElemTy = reduceInnermostDim(newTy); Type newElemTy = reduceInnermostDim(newTy);

auto [lhsElem0, lhsElem1] = extractLastDimHalves(rewriter, loc, lhs); auto [lhsElem0, lhsElem1] =

auto [rhsElem0, rhsElem1] = extractLastDimHalves(rewriter, loc, rhs); extractLastDimHalves(rewriter, loc, adaptor.getLhs());

auto [rhsElem0, rhsElem1] =

extractLastDimHalves(rewriter, loc, adaptor.getRhs());

auto lowSum = rewriter.create<arith::AddUICarryOp>(loc, lhsElem0, rhsElem0); auto lowSum = rewriter.create<arith::AddUICarryOp>(loc, lhsElem0, rhsElem0);

Value carryVal = Value carryVal =

rewriter.create<arith::ExtUIOp>(loc, newElemTy, lowSum.getCarry()); rewriter.create<arith::ExtUIOp>(loc, newElemTy, lowSum.getCarry());

Value high0 = rewriter.create<arith::AddIOp>(loc, carryVal, lhsElem1); Value high0 = rewriter.create<arith::AddIOp>(loc, carryVal, lhsElem1);

Value high = rewriter.create<arith::AddIOp>(loc, high0, rhsElem1); Value high = rewriter.create<arith::AddIOp>(loc, high0, rhsElem1);

Value resultVec = Value resultVec =

constructResultVector(rewriter, loc, newTy, {lowSum.getSum(), high}); constructResultVector(rewriter, loc, newTy, {lowSum.getSum(), high});

rewriter.replaceOp(op, resultVec); rewriter.replaceOp(op, resultVec);

return success(); return success();

} }

}; };

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

// ConvertMulI

//===----------------------------------------------------------------------===//

struct ConvertMulI final : OpConversionPattern<arith::MulIOp> {

using OpConversionPattern::OpConversionPattern;

LogicalResult

matchAndRewrite(arith::MulIOp op, OpAdaptor adaptor,

ConversionPatternRewriter &rewriter) const override {

Location loc = op->getLoc();

auto newTy = getTypeConverter()

->convertType(op.getType())

.dyn_cast_or_null<VectorType>();

if (!newTy)

return rewriter.notifyMatchFailure(loc, "expected scalar or vector type");

Type newElemTy = reduceInnermostDim(newTy);

unsigned newBitWidth = newTy.getElementTypeBitWidth();

unsigned digitBitWidth = newBitWidth / 2;

auto [lhsElem0, lhsElem1] =

extractLastDimHalves(rewriter, loc, adaptor.getLhs());

auto [rhsElem0, rhsElem1] =

extractLastDimHalves(rewriter, loc, adaptor.getRhs());

// Emulate multiplication by splitting each input element of type i2N into 4

// digits of type iN and bit width i(N/2). This is so that the intermediate

// multiplications and additions do not overflow. We extract these i(N/2)

// digits from iN vector elements by masking (low digit) and shifting right

// (high digit).

// The multiplication algorithm used is the standard (long) multiplication.

// Multiplying two i2N integers produces (at most) a i4N result, but because

// the calculation of top i2N is not necessary, we omit it.

// In total, this implementation performs 10 intermediate multiplications

// and 16 additions. The number of multiplications could be decreased by

// switching to a more efficient algorithm like Karatsuba. This would,

// however, require being able to perform (intermediate) wide additions and

// subtractions, so it is not clear that such implementation would be more

// efficient.

APInt lowMaskVal(newBitWidth, 1);

lowMaskVal = lowMaskVal.shl(digitBitWidth) - 1;

Value lowMask =

createScalarOrSplatConstant(rewriter, loc, newElemTy, lowMaskVal);

auto getLowDigit = [lowMask, newElemTy, loc, &rewriter](Value v) {

return rewriter.create<arith::AndIOp>(loc, newElemTy, v, lowMask);

};

Value shiftVal =

createScalarOrSplatConstant(rewriter, loc, newElemTy, digitBitWidth);

auto getHighDigit = [shiftVal, loc, &rewriter](Value v) {

return rewriter.create<arith::ShRUIOp>(loc, v, shiftVal);

};

Value zeroDigit = createScalarOrSplatConstant(rewriter, loc, newElemTy, 0);

std::array<Value, 4> resultDigits = {zeroDigit, zeroDigit, zeroDigit,

zeroDigit};

std::array<Value, 4> lhsDigits = {

getLowDigit(lhsElem0), getHighDigit(lhsElem0), getLowDigit(lhsElem1),

getHighDigit(lhsElem1)};

std::array<Value, 4> rhsDigits = {

getLowDigit(rhsElem0), getHighDigit(rhsElem0), getLowDigit(rhsElem1),

getHighDigit(rhsElem1)};

for (unsigned i = 0, e = lhsDigits.size(); i != e; ++i) {

for (unsigned j = 0; i + j != e; ++j) {

MogballUnsubmitted

Done

It looks like i+j is used more frequently in the loop than j. Can you change the for loop to for (unsigned j = i; j != e; ++j)?

Mogball: It looks like `i+j` is used more frequently in the loop than `j`. Can you make the for loop `for…

kuharAuthorUnsubmitted

Done

I think it's more straightforward as-is. If we defined j as i, we would have to introduce a subtraction to access rhsDigits.

kuhar: I think it's more straightforward as-is. If we defined `j` as `i`, we would have to introduce a…

Value mul =

rewriter.create<arith::MulIOp>(loc, lhsDigits[i], rhsDigits[j]);

Value current =

rewriter.createOrFold<arith::AddIOp>(loc, resultDigits[i + j], mul);

resultDigits[i + j] = getLowDigit(current);

if (i + j + 1 != e) {

Value carry = rewriter.createOrFold<arith::AddIOp>(

loc, resultDigits[i + j + 1], getHighDigit(current));

resultDigits[i + j + 1] = carry;

}

auto combineDigits = [shiftVal, loc, &rewriter](Value low, Value high) {

Value highBits = rewriter.create<arith::ShLIOp>(loc, high, shiftVal);

return rewriter.create<arith::OrIOp>(loc, low, highBits);

};

Value resultElem0 = combineDigits(resultDigits[0], resultDigits[1]);

Value resultElem1 = combineDigits(resultDigits[2], resultDigits[3]);

Value resultVec =

constructResultVector(rewriter, loc, newTy, {resultElem0, resultElem1});

rewriter.replaceOp(op, resultVec);

return success();

}

};

//===----------------------------------------------------------------------===//

// ConvertExtSI // ConvertExtSI

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

struct ConvertExtSI final : OpConversionPattern<arith::ExtSIOp> { struct ConvertExtSI final : OpConversionPattern<arith::ExtSIOp> {

using OpConversionPattern::OpConversionPattern; using OpConversionPattern::OpConversionPattern;

LogicalResult LogicalResult

matchAndRewrite(arith::ExtSIOp op, OpAdaptor adaptor, matchAndRewrite(arith::ExtSIOp op, OpAdaptor adaptor,

ConversionPatternRewriter &rewriter) const override { ConversionPatternRewriter &rewriter) const override {

Location loc = op->getLoc(); Location loc = op->getLoc();

auto newTy = getTypeConverter() auto newTy = getTypeConverter()

->convertType(op.getType()) ->convertType(op.getType())

.dyn_cast_or_null<VectorType>(); .dyn_cast_or_null<VectorType>();

if (!newTy) if (!newTy)

return rewriter.notifyMatchFailure(loc, "unsupported type"); return rewriter.notifyMatchFailure(loc, "unsupported type");

Type newResultComponentTy = reduceInnermostDim(newTy); Type newResultComponentTy = reduceInnermostDim(newTy);

// Sign-extend the input value to determine the low half of the result. // Sign-extend the input value to determine the low half of the result.

// Then, check if the low half is negative, and sign-extend the comparison // Then, check if the low half is negative, and sign-extend the comparison

// result to get the high half. // result to get the high half.

Value newOperand = appendX1Dim(rewriter, loc, adaptor.getIn()); Value newOperand = appendX1Dim(rewriter, loc, adaptor.getIn());

Value extended = rewriter.createOrFold<arith::ExtSIOp>( Value extended = rewriter.createOrFold<arith::ExtSIOp>(

loc, newResultComponentTy, newOperand); loc, newResultComponentTy, newOperand);

Value operandZeroCst = rewriter.create<arith::ConstantOp>( Value operandZeroCst =

loc, rewriter.getZeroAttr(newResultComponentTy)); createScalarOrSplatConstant(rewriter, loc, newResultComponentTy, 0);

Value signBit = rewriter.create<arith::CmpIOp>( Value signBit = rewriter.create<arith::CmpIOp>(

loc, arith::CmpIPredicate::slt, extended, operandZeroCst); loc, arith::CmpIPredicate::slt, extended, operandZeroCst);

Value signValue = Value signValue =

rewriter.create<arith::ExtSIOp>(loc, newResultComponentTy, signBit); rewriter.create<arith::ExtSIOp>(loc, newResultComponentTy, signBit);

Value resultVec = Value resultVec =

constructResultVector(rewriter, loc, newTy, {extended, signValue}); constructResultVector(rewriter, loc, newTy, {extended, signValue});

rewriter.replaceOp(op, resultVec); rewriter.replaceOp(op, resultVec);

Show All 20 Lines matchAndRewrite(arith::ExtUIOp op, OpAdaptor adaptor,

Type newResultComponentTy = reduceInnermostDim(newTy); Type newResultComponentTy = reduceInnermostDim(newTy);

// Zero-extend the input value to determine the low half of the result. // Zero-extend the input value to determine the low half of the result.

// The high half is always zero. // The high half is always zero.

Value newOperand = appendX1Dim(rewriter, loc, adaptor.getIn()); Value newOperand = appendX1Dim(rewriter, loc, adaptor.getIn());

Value extended = rewriter.createOrFold<arith::ExtUIOp>( Value extended = rewriter.createOrFold<arith::ExtUIOp>(

loc, newResultComponentTy, newOperand); loc, newResultComponentTy, newOperand);

Value zeroCst = rewriter.create<arith::ConstantOp>( Value zeroCst = createScalarOrSplatConstant(rewriter, loc, newTy, 0);

op->getLoc(), rewriter.getZeroAttr(newTy));

Value newRes = insertLastDimSlice(rewriter, loc, extended, zeroCst, 0); Value newRes = insertLastDimSlice(rewriter, loc, extended, zeroCst, 0);

rewriter.replaceOp(op, newRes); rewriter.replaceOp(op, newRes);

return success(); return success();

} }

}; };

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

// ConvertTruncI // ConvertTruncI

▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines

// Pass Definition // Pass Definition

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

struct EmulateWideIntPass final struct EmulateWideIntPass final

: arith::impl::ArithmeticEmulateWideIntBase<EmulateWideIntPass> { : arith::impl::ArithmeticEmulateWideIntBase<EmulateWideIntPass> {

using ArithmeticEmulateWideIntBase::ArithmeticEmulateWideIntBase; using ArithmeticEmulateWideIntBase::ArithmeticEmulateWideIntBase;

void runOnOperation() override { void runOnOperation() override {

if (!llvm::isPowerOf2_32(widestIntSupported)) { if (!llvm::isPowerOf2_32(widestIntSupported) || widestIntSupported < 2) {

signalPassFailure(); signalPassFailure();

return; return;

} }

Operation *op = getOperation(); Operation *op = getOperation();

MLIRContext *ctx = op->getContext(); MLIRContext *ctx = op->getContext();

arith::WideIntEmulationConverter typeConverter(widestIntSupported); arith::WideIntEmulationConverter typeConverter(widestIntSupported);

Show All 20 Lines

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

// Public Interface Definition // Public Interface Definition

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

arith::WideIntEmulationConverter::WideIntEmulationConverter( arith::WideIntEmulationConverter::WideIntEmulationConverter(

unsigned widestIntSupportedByTarget) unsigned widestIntSupportedByTarget)

: maxIntWidth(widestIntSupportedByTarget) { : maxIntWidth(widestIntSupportedByTarget) {

assert(llvm::isPowerOf2_32(widestIntSupportedByTarget) && assert(llvm::isPowerOf2_32(widestIntSupportedByTarget) &&

"Only power-of-two integers are supported"); "Only power-of-two integers with are supported");

assert(widestIntSupportedByTarget >= 2 && "Integer type too narrow");

// Scalar case. // Scalar case.

addConversion([this](IntegerType ty) -> Optional<Type> { addConversion([this](IntegerType ty) -> Optional<Type> {

unsigned width = ty.getWidth(); unsigned width = ty.getWidth();

if (width <= maxIntWidth) if (width <= maxIntWidth)

return ty; return ty;

// i2N --> vector<2xiN> // i2N --> vector<2xiN>

▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines void arith::populateWideIntEmulationPatterns(

populateCallOpTypeConversionPattern(patterns, typeConverter); populateCallOpTypeConversionPattern(patterns, typeConverter);

populateReturnOpTypeConversionPattern(patterns, typeConverter); populateReturnOpTypeConversionPattern(patterns, typeConverter);

// Populate `arith.*` conversion patterns. // Populate `arith.*` conversion patterns.

patterns.add< patterns.add<

// Misc ops. // Misc ops.

ConvertConstant, ConvertVectorPrint, ConvertConstant, ConvertVectorPrint,

// Binary ops. // Binary ops.

ConvertAddI, ConvertAddI, ConvertMulI,

// Extension and truncation ops. // Extension and truncation ops.

ConvertExtSI, ConvertExtUI, ConvertTruncI>(typeConverter, ConvertExtSI, ConvertExtUI, ConvertTruncI>(typeConverter,

patterns.getContext()); patterns.getContext());

} }

mlir/test/Dialect/Arithmetic/emulate-wide-int-very-wide.mlir

This file was added.

				// Check that emulation of very wide types (>64 bits) works as expected.

				// RUN: mlir-opt --arith-emulate-wide-int="widest-int-supported=512" %s | FileCheck %s

				// CHECK-LABEL: func.func @muli_scalar
				// CHECK-SAME: ([[ARG0:%.+]]: vector<2xi512>, [[ARG1:%.+]]: vector<2xi512>) -> vector<2xi512>
				// CHECK-NEXT: [[LOW0:%.+]] = vector.extract [[ARG0]][0] : vector<2xi512>
				// CHECK-NEXT: [[HIGH0:%.+]] = vector.extract [[ARG0]][1] : vector<2xi512>
				// CHECK-NEXT: [[LOW1:%.+]] = vector.extract [[ARG1]][0] : vector<2xi512>
				// CHECK-NEXT: [[HIGH1:%.+]] = vector.extract [[ARG1]][1] : vector<2xi512>
				//
				// Check that the mask for the low 256-bits was generated correctly. The exact expected value is 2^256 - 1.
				// CHECK-NEXT: {{.+}} = arith.constant 115792089237316195423570985008687907853269984665640564039457584007913129639935 : i512
				// CHECK: return {{%.+}} : vector<2xi512>
				// Multiplies two i1024 scalars, which are wider than the 512-bit limit
				// given on the command line above and therefore must be emulated.
				func.func @muli_scalar(%lhs : i1024, %rhs : i1024) -> i1024 {
				%product = arith.muli %lhs, %rhs : i1024
				return %product : i1024
				}

mlir/test/Dialect/Arithmetic/emulate-wide-int.mlir

	Show First 20 Lines • Show All 199 Lines • ▼ Show 20 Lines
	// CHECK-NEXT: [[EXTR:%.+]] = vector.extract_strided_slice [[ARG]] {offsets = [0, 0], sizes = [3, 1], strides = [1, 1]} : vector<3x2xi32> to vector<3x1xi32>			// CHECK-NEXT: [[EXTR:%.+]] = vector.extract_strided_slice [[ARG]] {offsets = [0, 0], sizes = [3, 1], strides = [1, 1]} : vector<3x2xi32> to vector<3x1xi32>
	// CHECK-NEXT: [[SHAPE:%.+]] = vector.shape_cast [[EXTR]] : vector<3x1xi32> to vector<3xi32>			// CHECK-NEXT: [[SHAPE:%.+]] = vector.shape_cast [[EXTR]] : vector<3x1xi32> to vector<3xi32>
	// CHECK-NEXT: [[TRNC:%.+]] = arith.trunci [[SHAPE]] : vector<3xi32> to vector<3xi16>			// CHECK-NEXT: [[TRNC:%.+]] = arith.trunci [[SHAPE]] : vector<3xi32> to vector<3xi16>
	// CHECK-NEXT: return [[TRNC]] : vector<3xi16>			// CHECK-NEXT: return [[TRNC]] : vector<3xi16>
	func.func @trunci_vector(%a : vector<3xi64>) -> vector<3xi16> {			func.func @trunci_vector(%a : vector<3xi64>) -> vector<3xi16> {
	%b = arith.trunci %a : vector<3xi64> to vector<3xi16>			%b = arith.trunci %a : vector<3xi64> to vector<3xi16>
	return %b : vector<3xi16>			return %b : vector<3xi16>
	}			}

				// CHECK-LABEL: func.func @muli_scalar
				// CHECK-SAME: ([[ARG0:%.+]]: vector<2xi32>, [[ARG1:%.+]]: vector<2xi32>) -> vector<2xi32>
				// CHECK-NEXT: [[LOW0:%.+]] = vector.extract [[ARG0]][0] : vector<2xi32>
				// CHECK-NEXT: [[HIGH0:%.+]] = vector.extract [[ARG0]][1] : vector<2xi32>
				// CHECK-NEXT: [[LOW1:%.+]] = vector.extract [[ARG1]][0] : vector<2xi32>
				// CHECK-NEXT: [[HIGH1:%.+]] = vector.extract [[ARG1]][1] : vector<2xi32>
				//
				// CHECK-DAG: [[MASK:%.+]] = arith.constant 65535 : i32
				// CHECK-DAG: [[C16:%.+]] = arith.constant 16 : i32
				//
				// CHECK: [[LOWLOW0:%.+]] = arith.andi [[LOW0]], [[MASK]] : i32
				// CHECK-NEXT: [[HIGHLOW0:%.+]] = arith.shrui [[LOW0]], [[C16]] : i32
				// CHECK-NEXT: [[LOWHIGH0:%.+]] = arith.andi [[HIGH0]], [[MASK]] : i32
				MogballUnsubmitted Done Reply Inline Actions it would be nice to have "integration tests" for some of these expansions. Verifying the generated code for correctness is kind of difficult if it's this long. Having some cases with `mlir-cpu-runner` that produce the expected output would boost confidence that the implementation is correct (instead of me having to comb through all this code :( ) Mogball: it would be nice to have "integration tests" for some of these expansions. Verifying the…
				kuharAuthorUnsubmitted Done Reply Inline Actions That's a very good point and something I had a couple of attempts at. Coming up with these emulation patterns is not trivial to me, which made me consider a few approaches to gain confidence in the implementation: (1) Convert to the LLVM Dialect and then to LLVM IR. Verify that the emulated op matches the wide one, using Alive 2. Unfortunately, this did not scale for me and I would get timeouts when trying to validate emulated addi (which was much simpler than `muli` or `shrui`!) (2) Hand-pick a few corner cases, provide input as constants, and rely on the constant folding code to get the final result as constants, which could be checked with LIT. I ran into a few issues with missing folds, but even if we add anything that's missing, this still seems fragile to me. This could effectively force the constant folding code to support some CPU-intensive folds that are not useful otherwise. (3) Add runtime integration tests in IREE, based on iree-run-module. I discarded this idea because it would force me to add memref support earlier than I would like, and either require some cherry picks for our llvm-project fork, or waiting until llvm-project phabricator patches have landed. And ultimately, this is not very desirable from the perspective of llvm-project itself. (4) Add runtime integration tests based on a bespoke test harness. This is what I ended up implementing to verify the implementation locally, and is available on my fork: https://github.com/kuhar/llvm-project/tree/arith_addi_carry/jakub. This is way too hacky to be upstreamed though. Checking all 16-bit input pairs takes around 30s on my machine (on a single thread). I haven't looked into `mlir-cpu-runner` yet, I'll see what it would take to make it work and be upstreamable. Thanks for the suggestion! kuhar: That's a very good point and something I had a couple of attempts at. Coming up with these…
				// CHECK-NEXT: [[HIGHHIGH0:%.+]] = arith.shrui [[HIGH0]], [[C16]] : i32
				// CHECK-NEXT: [[LOWLOW1:%.+]] = arith.andi [[LOW1]], [[MASK]] : i32
				// CHECK-NEXT: [[HIGHLOW1:%.+]] = arith.shrui [[LOW1]], [[C16]] : i32
				// CHECK-NEXT: [[LOWHIGH1:%.+]] = arith.andi [[HIGH1]], [[MASK]] : i32
				// CHECK-NEXT: [[HIGHHIGH1:%.+]] = arith.shrui [[HIGH1]], [[C16]] : i32
				//
				// CHECK-DAG: {{%.+}} = arith.muli [[LOWLOW0]], [[LOWLOW1]] : i32
				// CHECK-DAG: {{%.+}} = arith.muli [[LOWLOW0]], [[HIGHLOW1]] : i32
				// CHECK-DAG: {{%.+}} = arith.muli [[LOWLOW0]], [[LOWHIGH1]] : i32
				// CHECK-DAG: {{%.+}} = arith.muli [[LOWLOW0]], [[HIGHHIGH1]] : i32
				//
				// CHECK-DAG: {{%.+}} = arith.muli [[HIGHLOW0]], [[LOWLOW1]] : i32
				// CHECK-DAG: {{%.+}} = arith.muli [[HIGHLOW0]], [[HIGHLOW1]] : i32
				// CHECK-DAG: {{%.+}} = arith.muli [[HIGHLOW0]], [[LOWHIGH1]] : i32
				//
				// CHECK-DAG: {{%.+}} = arith.muli [[LOWHIGH0]], [[LOWLOW1]] : i32
				// CHECK-DAG: {{%.+}} = arith.muli [[LOWHIGH0]], [[HIGHLOW1]] : i32
				//
				// CHECK-DAG: {{%.+}} = arith.muli [[HIGHHIGH0]], [[LOWLOW1]] : i32
				//
				// CHECK: [[RESHIGH0:%.+]] = arith.shli {{%.+}}, [[C16]] : i32
				// CHECK-NEXT: [[RES0:%.+]] = arith.ori {{%.+}}, [[RESHIGH0]] : i32
				// CHECK-NEXT: [[RESHIGH1:%.+]] = arith.shli {{%.+}}, [[C16]] : i32
				// CHECK-NEXT: [[RES1:%.+]] = arith.ori {{%.+}}, [[RESHIGH1]] : i32
				// CHECK-NEXT: [[VZ:%.+]] = arith.constant dense<0> : vector<2xi32>
				// CHECK-NEXT: [[INS0:%.+]] = vector.insert [[RES0]], [[VZ]] [0] : i32 into vector<2xi32>
				// CHECK-NEXT: [[INS1:%.+]] = vector.insert [[RES1]], [[INS0]] [1] : i32 into vector<2xi32>
				// CHECK-NEXT: return [[INS1]] : vector<2xi32>
				// Scalar i64 multiplication; the expectations above describe its
				// emulated form operating on vector<2xi32> values.
				func.func @muli_scalar(%lhs : i64, %rhs : i64) -> i64 {
				%product = arith.muli %lhs, %rhs : i64
				return %product : i64
				}

				// CHECK-LABEL: func.func @muli_vector
				// CHECK-SAME: ({{%.+}}: vector<3x2xi32>, {{%.+}}: vector<3x2xi32>) -> vector<3x2xi32>
				// CHECK: return {{%.+}} : vector<3x2xi32>
				// Element-wise i64 multiplication on a 3-element vector; the
				// expectations above only pin the converted signature and return type.
				func.func @muli_vector(%lhs : vector<3xi64>, %rhs : vector<3xi64>) -> vector<3xi64> {
				%product = arith.muli %lhs, %rhs : vector<3xi64>
				return %product : vector<3xi64>
				}

mlir/test/Integration/Dialect/Arithmetic/CPU/test-wide-int-emulation-muli-i16.mlir

This file was added.

				// Check that the wide integer multiplication emulation produces the same result as wide
				// multiplication. Emulate i16 ops with i8 ops.

				// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \
				// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \
				// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
				// RUN: --shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
				// RUN: FileCheck %s --match-full-lines --check-prefix=WIDE

				// RUN: mlir-opt %s --arith-emulate-wide-int="widest-int-supported=8" \
				// RUN: --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \
				// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \
				// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
				// RUN: --shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
				// RUN: FileCheck %s --match-full-lines --check-prefix=EMULATED

				// Helper: multiplies the two i16 operands and prints the product so
				// that the runs above can compare the printed values.
				func.func @check_muli(%x : i16, %y : i16) {
				%product = arith.muli %x, %y : i16
				vector.print %product : i16
				return
				}

				// Test driver: calls @check_muli on a fixed list of i16 operand pairs.
				// Each call is preceded by the output expected from the two runs above:
				// the plain run prints one i16 value, the emulated run prints a pair of
				// i8 values with the low half first. The call order must stay in sync
				// with the order of the expectation lines.
				func.func @entry() {
				%cst0 = arith.constant 0 : i16
				%cst1 = arith.constant 1 : i16
				%cst_1 = arith.constant -1 : i16
				%cst_3 = arith.constant -3 : i16

				%cst13 = arith.constant 13 : i16
				%cst37 = arith.constant 37 : i16
				%cst42 = arith.constant 42 : i16

				%cst256 = arith.constant 256 : i16
				%cst_i16_max = arith.constant 32767 : i16
				%cst_i16_min = arith.constant -32768 : i16

				// Products of zero, one, and small negative operands.
				// WIDE: 0
				// EMULATED: ( 0, 0 )
				func.call @check_muli(%cst0, %cst0) : (i16, i16) -> ()
				// WIDE-NEXT: 0
				// EMULATED-NEXT: ( 0, 0 )
				func.call @check_muli(%cst0, %cst1) : (i16, i16) -> ()
				// WIDE-NEXT: 1
				// EMULATED-NEXT: ( 1, 0 )
				func.call @check_muli(%cst1, %cst1) : (i16, i16) -> ()
				// WIDE-NEXT: -1
				// EMULATED-NEXT: ( -1, -1 )
				func.call @check_muli(%cst1, %cst_1) : (i16, i16) -> ()
				// WIDE-NEXT: 1
				// EMULATED-NEXT: ( 1, 0 )
				func.call @check_muli(%cst_1, %cst_1) : (i16, i16) -> ()
				// WIDE-NEXT: -3
				// EMULATED-NEXT: ( -3, -1 )
				func.call @check_muli(%cst1, %cst_3) : (i16, i16) -> ()

				// Small positive operands; the low half is printed as a signed i8, so
				// low bytes above 127 show up as negative numbers.
				// WIDE-NEXT: 169
				// EMULATED-NEXT: ( -87, 0 )
				func.call @check_muli(%cst13, %cst13) : (i16, i16) -> ()
				// WIDE-NEXT: 481
				// EMULATED-NEXT: ( -31, 1 )
				func.call @check_muli(%cst13, %cst37) : (i16, i16) -> ()
				// WIDE-NEXT: 1554
				// EMULATED-NEXT: ( 18, 6 )
				func.call @check_muli(%cst37, %cst42) : (i16, i16) -> ()

				// Multiplications by 256: the low half of the result is zero and the
				// other operand's low byte lands in the high half.
				// WIDE-NEXT: -256
				// EMULATED-NEXT: ( 0, -1 )
				func.call @check_muli(%cst_1, %cst256) : (i16, i16) -> ()
				// WIDE-NEXT: 3328
				// EMULATED-NEXT: ( 0, 13 )
				func.call @check_muli(%cst256, %cst13) : (i16, i16) -> ()
				// WIDE-NEXT: 9472
				// EMULATED-NEXT: ( 0, 37 )
				func.call @check_muli(%cst256, %cst37) : (i16, i16) -> ()
				// WIDE-NEXT: -768
				// EMULATED-NEXT: ( 0, -3 )
				func.call @check_muli(%cst256, %cst_3) : (i16, i16) -> ()

				// One operand at the i16 maximum or minimum; the products wrap around.
				// WIDE-NEXT: 32755
				// EMULATED-NEXT: ( -13, 127 )
				func.call @check_muli(%cst13, %cst_i16_max) : (i16, i16) -> ()
				// WIDE-NEXT: -32768
				// EMULATED-NEXT: ( 0, -128 )
				func.call @check_muli(%cst_i16_min, %cst37) : (i16, i16) -> ()

				// Further wrap-around cases, including both operands at the same
				// extreme.
				// WIDE-NEXT: 1
				// EMULATED-NEXT: ( 1, 0 )
				func.call @check_muli(%cst_i16_max, %cst_i16_max) : (i16, i16) -> ()
				// WIDE-NEXT: -32768
				// EMULATED-NEXT: ( 0, -128 )
				func.call @check_muli(%cst_i16_min, %cst13) : (i16, i16) -> ()
				// WIDE-NEXT: 0
				// EMULATED-NEXT: ( 0, 0 )
				func.call @check_muli(%cst_i16_min, %cst_i16_min) : (i16, i16) -> ()

				return
				}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][arith] Support wide integer multiplication emulation
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 460790

mlir/lib/Dialect/Arithmetic/Transforms/EmulateWideInt.cpp

mlir/test/Dialect/Arithmetic/emulate-wide-int-very-wide.mlir

mlir/test/Dialect/Arithmetic/emulate-wide-int.mlir

mlir/test/Integration/Dialect/Arithmetic/CPU/test-wide-int-emulation-muli-i16.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][arith] Support wide integer multiplication emulationClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 460790

mlir/lib/Dialect/Arithmetic/Transforms/EmulateWideInt.cpp

mlir/test/Dialect/Arithmetic/emulate-wide-int-very-wide.mlir

mlir/test/Dialect/Arithmetic/emulate-wide-int.mlir

mlir/test/Integration/Dialect/Arithmetic/CPU/test-wide-int-emulation-muli-i16.mlir

[mlir][arith] Support wide integer multiplication emulation
ClosedPublic