diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -645,6 +645,20 @@ let constructor = "tosa::createTosaToLinalg()"; } +//===----------------------------------------------------------------------===// +// TosaToLinalgNamed +//===----------------------------------------------------------------------===// + +def TosaToLinalgNamed : FunctionPass<"tosa-to-linalg-named"> { + let summary = "Lower TOSA to LinAlg named operations"; + let description = [{ + Pass that converts TOSA operations to the equivalent operations using the + Linalg named operations. + }]; + + let constructor = "tosa::createTosaToLinalgNamed()"; +} + //===----------------------------------------------------------------------===// // TosaToSCF //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h b/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h --- a/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h +++ b/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h @@ -20,6 +20,7 @@ namespace tosa { std::unique_ptr createTosaToLinalg(); +std::unique_ptr createTosaToLinalgNamed(); /// Populates passes to convert from TOSA to Linalg on buffers. At the end of /// the pass, the function will only contain linalg ops or standard ops if the @@ -29,6 +30,9 @@ /// Populates conversion passes from TOSA dialect to Linalg dialect. void populateTosaToLinalgConversionPatterns(RewritePatternSet *patterns); +/// Populates conversion patterns from TOSA dialect to Linalg named operations. 
+void populateTosaToLinalgNamedConversionPatterns(RewritePatternSet *patterns); + } // namespace tosa } // namespace mlir diff --git a/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt b/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt --- a/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt +++ b/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt @@ -1,5 +1,7 @@ add_mlir_conversion_library(MLIRTosaToLinalg TosaToLinalg.cpp + TosaToLinalgNamed.cpp + TosaToLinalgNamedPass.cpp TosaToLinalgPass.cpp ADDITIONAL_HEADER_DIRS diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -61,37 +61,6 @@ return rewriter.create(loc, largerThanMax, max, minOrArg); } -static mlir::Value applyPad(Location loc, Value input, ArrayRef pad, - Attribute padAttr, OpBuilder &rewriter) { - // Input should be padded if necessary. - if (llvm::all_of(pad, [](int64_t p) { return p == 0; })) - return input; - - ShapedType inputTy = input.getType().cast(); - Type inputETy = inputTy.getElementType(); - auto inputShape = inputTy.getShape(); - - assert((inputShape.size() * 2) == pad.size()); - - SmallVector paddedShape; - SmallVector lowIndices; - SmallVector highIndices; - for (int i = 0, s = inputShape.size(); i < s; i++) { - auto lowPad = pad[i * 2]; - auto highPad = pad[i * 2 + 1]; - paddedShape.push_back(inputShape[i] + highPad + lowPad); - lowIndices.push_back(rewriter.getIndexAttr(lowPad)); - highIndices.push_back(rewriter.getIndexAttr(highPad)); - } - - Value padValue = rewriter.create(loc, padAttr); - - return linalg::PadTensorOp::createPadScalarOp( - RankedTensorType::get(paddedShape, inputETy), input, padValue, - lowIndices, highIndices, /*nofold=*/false, loc, rewriter) - .result(); -} - static SmallVector filterDynamicDims(SmallVector dynDims) { SmallVector filteredDims; for (auto dim : dynDims) @@ -1065,510 +1034,6 @@ } }; 
-class ConvConverter : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(tosa::Conv2DOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - Location loc = op->getLoc(); - Value input = op->getOperand(0); - Value weight = op->getOperand(1); - Value bias = op->getOperand(2); - - ShapedType inputTy = input.getType().cast(); - ShapedType weightTy = weight.getType().cast(); - ShapedType biasTy = bias.getType().cast(); - ShapedType resultTy = op->getResult(0).getType().cast(); - - Type inputETy = inputTy.getElementType(); - Type resultETy = resultTy.getElementType(); - - auto padAttr = op->getAttr("pad").cast(); - auto strideTosaAttr = op->getAttr("stride").cast(); - auto dilationTosaAttr = op->getAttr("dilation").cast(); - bool isQuantized = op->hasAttr("quantization_info"); - - if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() || - !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) - return rewriter.notifyMatchFailure(op, - "tosa.conv ops require static shapes"); - - if (inputETy.isUnsignedInteger()) - return rewriter.notifyMatchFailure( - op, "tosa.conv ops does not support unsigned integer input"); - - auto weightShape = weightTy.getShape(); - - // Apply padding as necessary. 
- Attribute zeroAttr = rewriter.getZeroAttr(inputETy); - if (isQuantized) { - auto quantizationInfo = - op->getAttr("quantization_info").cast(); - auto iZp = quantizationInfo.input_zp().getValue().getSExtValue(); - - int64_t intMin = - APInt::getSignedMinValue(inputETy.getIntOrFloatBitWidth()) - .getSExtValue(); - int64_t intMax = - APInt::getSignedMaxValue(inputETy.getIntOrFloatBitWidth()) - .getSExtValue(); - - if (iZp < intMin || iZp > intMax) - return rewriter.notifyMatchFailure( - op, "tosa.conv op quantization has zp outside of input range"); - - zeroAttr = rewriter.getIntegerAttr(inputETy, iZp); - } - - llvm::SmallVector pad; - pad.resize(2, 0); - getValuesFromIntArrayAttribute(padAttr, pad); - pad.resize(pad.size() + 2, 0); - input = applyPad(loc, input, pad, zeroAttr, rewriter); - - // Transpose the kernel to match dimension ordering of the linalg - // convolution operation. - // TODO(suderman): See if this can be efficiently folded - check whether - // the input is used anywhere else, if not fold the constant. - SmallVector weightPerm{1, 2, 3, 0}; - SmallVector newWeightShape{weightShape[1], weightShape[2], - weightShape[3], weightShape[0]}; - auto weightPermAttr = DenseIntElementsAttr::get( - RankedTensorType::get({4}, rewriter.getI64Type()), weightPerm); - Value weightPermValue = - rewriter.create(loc, weightPermAttr); - Type newWeightTy = - RankedTensorType::get(newWeightShape, weightTy.getElementType()); - weight = rewriter.create(loc, newWeightTy, weight, - weightPermValue); - - Attribute resultZeroAttr = rewriter.getZeroAttr(resultETy); - Value initTensor = rewriter.create( - loc, resultTy.getShape(), resultETy); - Value zero = rewriter.create(loc, resultZeroAttr); - Value zeroTensor = - rewriter.create(loc, zero, initTensor).getResult(0); - - // Extract the attributes for convolution. 
- llvm::SmallVector stride, dilation; - getValuesFromIntArrayAttribute(strideTosaAttr, stride); - getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); - - // Create the convolution op. - auto strideAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), stride); - auto dilationAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), dilation); - - // Create maps for the bias broadcasting - SmallVector indexingMaps; - indexingMaps.push_back(AffineMap::get( - /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, - {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); - - Value biasInitTensor = rewriter.create( - loc, resultTy.getShape(), resultETy); - - if (isQuantized) { - auto quantizationInfo = - op->getAttr("quantization_info").cast(); - auto iZp = rewriter.getI32IntegerAttr( - quantizationInfo.input_zp().getValue().getSExtValue()); - auto kZp = rewriter.getI32IntegerAttr( - quantizationInfo.weight_zp().getValue().getSExtValue()); - - auto iZpVal = rewriter.create(loc, iZp); - auto kZpVal = rewriter.create(loc, kZp); - Value conv = - rewriter - .create( - loc, resultTy, ValueRange{input, weight, iZpVal, kZpVal}, - ValueRange{zeroTensor}, strideAttr, dilationAttr) - ->getResult(0); - - Value result = - rewriter - .create( - loc, resultTy, ValueRange({bias, conv}), biasInitTensor, - indexingMaps, getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - rewriter.replaceOp(op, result); - return success(); - } - - Value conv = rewriter - .create( - loc, resultTy, ValueRange{input, weight}, - ValueRange{zeroTensor}, strideAttr, dilationAttr) - ->getResult(0); 
- - Value result = - rewriter - .create( - loc, resultTy, ValueRange({bias, conv}), biasInitTensor, - indexingMaps, getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - - rewriter.replaceOp(op, result); - return success(); - } -}; - -class DepthwiseConvConverter - : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(tosa::DepthwiseConv2DOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - Location loc = op->getLoc(); - Value input = op->getOperand(0); - Value weight = op->getOperand(1); - Value bias = op->getOperand(2); - - ShapedType inputTy = input.getType().cast(); - ShapedType weightTy = weight.getType().cast(); - ShapedType biasTy = bias.getType().cast(); - ShapedType resultTy = op->getResult(0).getType().cast(); - - Type inputETy = inputTy.getElementType(); - Type resultETy = resultTy.getElementType(); - - auto padAttr = op->getAttr("pad").cast(); - auto strideTosaAttr = op->getAttr("stride").cast(); - auto dilationTosaAttr = op->getAttr("dilation").cast(); - - bool isQuantized = op->hasAttr("quantization_info"); - IntegerAttr iZp; - IntegerAttr kZp; - if (isQuantized) { - auto quantizationInfo = - op->getAttr("quantization_info").cast(); - iZp = rewriter.getI32IntegerAttr( - quantizationInfo.input_zp().getValue().getSExtValue()); - kZp = rewriter.getI32IntegerAttr( - quantizationInfo.weight_zp().getValue().getSExtValue()); - } - - if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() || - !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) - return rewriter.notifyMatchFailure(op, - "tosa.conv ops require static shapes"); - - auto weightShape = weightTy.getShape(); - auto resultShape = resultTy.getShape(); - - // Apply padding as necessary. 
- Attribute zeroAttr = rewriter.getZeroAttr(inputETy); - if (isQuantized) { - auto quantizationInfo = - op->getAttr("quantization_info").cast(); - auto iZp = quantizationInfo.input_zp().getValue().getSExtValue(); - - int64_t intMin = - APInt::getSignedMinValue(inputETy.getIntOrFloatBitWidth()) - .getSExtValue(); - int64_t intMax = - APInt::getSignedMaxValue(inputETy.getIntOrFloatBitWidth()) - .getSExtValue(); - - if (iZp < intMin || iZp > intMax) - return rewriter.notifyMatchFailure( - op, "tosa.depthwise_conv op quantization has zp outside of input " - "range"); - - zeroAttr = rewriter.getIntegerAttr(inputETy, iZp); - } - - llvm::SmallVector pad; - pad.resize(2, 0); - getValuesFromIntArrayAttribute(padAttr, pad); - pad.resize(pad.size() + 2, 0); - - input = applyPad(loc, input, pad, zeroAttr, rewriter); - - // Extract the attributes for convolution. - llvm::SmallVector stride, dilation; - getValuesFromIntArrayAttribute(strideTosaAttr, stride); - getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); - - // Create the convolution op. - auto strideAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), stride); - auto dilationAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), dilation); - ShapedType linalgConvTy = - RankedTensorType::get({resultShape[0], resultShape[1], resultShape[2], - weightShape[2], weightShape[3]}, - resultETy); - - // Broadcast the initial value to the output tensor before convolving. 
- SmallVector indexingMaps; - indexingMaps.push_back(AffineMap::get( - /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, - {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); - - Attribute resultZeroAttr = rewriter.getZeroAttr(resultETy); - Value initTensor = rewriter.create( - loc, linalgConvTy.getShape(), resultETy); - Value zero = rewriter.create(loc, resultZeroAttr); - Value zeroTensor = - rewriter.create(loc, zero, initTensor).getResult(0); - - Value biasInitTensor = rewriter.create( - loc, resultTy.getShape(), resultETy); - if (!isQuantized) { - Value conv = rewriter - .create( - loc, linalgConvTy, ValueRange{input, weight}, - ValueRange{zeroTensor}, strideAttr, dilationAttr) - .getResult(0); - Value convReshape = rewriter.create(loc, resultTy, conv); - Value result = - rewriter - .create( - loc, resultTy, ValueRange({bias, convReshape}), - biasInitTensor, indexingMaps, - getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - rewriter.replaceOp(op, result); - } else { - auto iZpVal = rewriter.create(loc, iZp); - auto kZpVal = rewriter.create(loc, kZp); - Value conv = - rewriter - .create( - loc, linalgConvTy, ValueRange{input, weight, iZpVal, kZpVal}, - ValueRange{zeroTensor}, strideAttr, dilationAttr) - .getResult(0); - Value convReshape = rewriter.create(loc, resultTy, conv); - Value result = - rewriter - .create( - loc, resultTy, ValueRange({bias, convReshape}), - biasInitTensor, indexingMaps, - getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, 
added); - }) - .getResult(0); - rewriter.replaceOp(op, result); - } - return success(); - } -}; - -class MatMulConverter : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(tosa::MatMulOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - Location loc = op.getLoc(); - - auto outputTy = op.getType().cast(); - auto outputElementTy = outputTy.getElementType(); - - auto firstOperandTy = op->getOperand(0).getType().cast(); - auto secondOperandTy = op->getOperand(1).getType().cast(); - - SmallVector dynDims; - dynDims.resize(op->getResult(0).getType().cast().getRank()); - - if (!firstOperandTy.hasRank() || firstOperandTy.isDynamicDim(0)) { - dynDims[0] = rewriter.create(loc, op->getOperand(0), 0); - } - - if (!firstOperandTy.hasRank() || firstOperandTy.isDynamicDim(1)) { - dynDims[1] = rewriter.create(loc, op->getOperand(0), 1); - } - - if (!secondOperandTy.hasRank() || secondOperandTy.isDynamicDim(2)) { - dynDims[2] = rewriter.create(loc, op->getOperand(1), 2); - } - - SmallVector filteredDims = filterDynamicDims(dynDims); - - auto zeroAttr = rewriter.getZeroAttr(outputElementTy); - Value zero = rewriter.create(loc, zeroAttr); - auto initTensor = rewriter.create( - loc, filteredDims, outputTy.getShape(), outputTy.getElementType()); - Value zeroTensor = - rewriter.create(loc, zero, initTensor).getResult(0); - if (!op.quantization_info()) { - rewriter.replaceOpWithNewOp( - op, TypeRange{op.getType()}, ValueRange{adaptor.a(), adaptor.b()}, - ValueRange{zeroTensor}); - return success(); - } - - auto quantizationInfo = op.quantization_info().getValue(); - auto aZp = rewriter.create( - loc, rewriter.getI32IntegerAttr( - quantizationInfo.a_zp().getValue().getSExtValue())); - auto bZp = rewriter.create( - loc, rewriter.getI32IntegerAttr( - quantizationInfo.b_zp().getValue().getSExtValue())); - rewriter.replaceOpWithNewOp( - op, TypeRange{op.getType()}, - ValueRange{adaptor.a(), 
adaptor.b(), aZp, bZp}, zeroTensor); - - return success(); - } -}; - -class FullyConnectedConverter - : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(tosa::FullyConnectedOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - Location loc = op.getLoc(); - auto outputTy = op.getType().cast(); - auto input = op.input(); - auto inputTy = input.getType().cast(); - - auto bias = op.bias(); - - auto weight = op.weight(); - auto weightTy = weight.getType().cast(); - auto weightShape = weightTy.getShape(); - - auto outputETy = outputTy.getElementType(); - - SmallVector dynDims; - dynDims.resize(op->getResult(0).getType().cast().getRank()); - - if (!inputTy.hasRank() || inputTy.isDynamicDim(0)) { - dynDims[0] = rewriter.create(loc, input, 0); - } - - if (!weightTy.hasRank() || weightTy.isDynamicDim(0)) { - dynDims[1] = rewriter.create(loc, weight, 0); - } - - SmallVector filteredDims = filterDynamicDims(dynDims); - - // Creating maps for the output of MatMul and the bias - SmallVector indexingMaps; - - // Broadcast the bias. 
- indexingMaps.push_back(AffineMap::get(/*dimCount=*/2, /*symbolCount=*/0, - {rewriter.getAffineDimExpr(1)}, - rewriter.getContext())); - - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(outputTy.getRank())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(outputTy.getRank())); - - auto initTensor = rewriter.create( - loc, filteredDims, outputTy.getShape(), outputTy.getElementType()); - - // When quantized, the input elemeny type is not the same as the output - Attribute resultZeroAttr = rewriter.getZeroAttr(outputETy); - Value zero = rewriter.create(loc, resultZeroAttr); - Value zeroTensor = - rewriter.create(loc, zero, initTensor).getResult(0); - - SmallVector permutation{1, 0}; - auto permutationAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), permutation); - Value permutationValue = - rewriter.create(loc, permutationAttr); - - SmallVector newWeightShape{weightShape[1], weightShape[0]}; - Type newWeightTy = - RankedTensorType::get(newWeightShape, weightTy.getElementType()); - - Value transposedWeight = rewriter.create( - loc, newWeightTy, weight, permutationValue); - - auto biasInitTensor = - rewriter - .create(loc, filteredDims, - outputTy.getShape(), outputETy) - ->getResults(); - - if (!op.quantization_info()) { - Value matmul = rewriter - .create( - loc, TypeRange{op.getType()}, - ValueRange{input, transposedWeight}, zeroTensor) - ->getResult(0); - - Value result = - rewriter - .create( - loc, outputTy, ValueRange({bias, matmul}), biasInitTensor, - indexingMaps, getNParallelLoopsAttrs(outputTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - rewriter.replaceOp(op, result); - return success(); - } - - auto quantizationInfo = op.quantization_info().getValue(); - auto inputZp = rewriter.create( - loc, rewriter.getI32IntegerAttr( - 
quantizationInfo.input_zp().getValue().getSExtValue())); - auto outputZp = rewriter.create( - loc, rewriter.getI32IntegerAttr( - quantizationInfo.weight_zp().getValue().getSExtValue())); - Value matmul = - rewriter - .create( - loc, TypeRange{op.getType()}, - ValueRange{input, transposedWeight, inputZp, outputZp}, - zeroTensor) - ->getResult(0); - Value result = - rewriter - .create( - loc, outputTy, ValueRange({bias, matmul}), biasInitTensor, - indexingMaps, getNParallelLoopsAttrs(outputTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - rewriter.replaceOp(op, result); - return success(); - } -}; - class ReshapeConverterCollapse : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -2810,277 +2275,6 @@ } }; -class MaxPool2dConverter : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(tosa::MaxPool2dOp op, - PatternRewriter &rewriter) const final { - Location loc = op.getLoc(); - Value input = op.input(); - ShapedType inputTy = input.getType().cast(); - - ShapedType resultTy = op.getType().template cast(); - Type resultETy = inputTy.getElementType(); - - if (!inputTy.hasStaticShape()) - return failure(); - - // Determine what the initial value needs to be for the max pool op. - Attribute initialAttr; - if (resultETy.isF32()) - initialAttr = rewriter.getFloatAttr( - resultETy, - APFloat::getLargest(resultETy.cast().getFloatSemantics(), - true)); - - if (resultETy.isa()) - initialAttr = rewriter.getIntegerAttr( - resultETy, - APInt::getSignedMinValue(resultETy.getIntOrFloatBitWidth())); - - if (!initialAttr) - return rewriter.notifyMatchFailure( - op, "Unsupported initial value for tosa.maxpool_2d op"); - - // Apply padding as necessary. 
- llvm::SmallVector pad; - pad.resize(2, 0); - getValuesFromIntArrayAttribute(op.pad(), pad); - pad.resize(pad.size() + 2, 0); - Value paddedInput = applyPad(loc, input, pad, initialAttr, rewriter); - - Value initialValue = rewriter.create(loc, initialAttr); - - SmallVector kernel, stride; - getValuesFromIntArrayAttribute(op.kernel(), kernel); - getValuesFromIntArrayAttribute(op.stride(), stride); - - Attribute strideAttr = rewriter.getI64VectorAttr(stride); - Attribute dilationAttr = rewriter.getI64VectorAttr({1, 1}); - - // Create the linalg op that performs pooling. - Value initTensor = rewriter.create( - loc, resultTy.getShape(), resultTy.getElementType()); - - Value filledInitTensor = - rewriter.create(loc, initialValue, initTensor).result(); - - Value fakeWindowDims = - rewriter.create(loc, kernel, resultETy); - - rewriter.replaceOpWithNewOp( - op, ArrayRef{resultTy}, ValueRange{paddedInput, fakeWindowDims}, - filledInitTensor, strideAttr, dilationAttr); - return success(); - } -}; - -class AvgPool2dConverter : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(tosa::AvgPool2dOp op, - PatternRewriter &rewriter) const final { - Location loc = op.getLoc(); - Value input = op.input(); - ShapedType inputTy = input.getType().cast(); - Type inElementTy = inputTy.getElementType(); - - ShapedType resultTy = op.getType().template cast(); - Type resultETy = op.getType().cast().getElementType(); - - Type accETy = - inElementTy.isa() ? rewriter.getI32Type() : inElementTy; - ShapedType accTy = resultTy.clone(accETy); - - if (!inputTy.hasStaticShape()) - return failure(); - - // Apply padding as necessary. 
- llvm::SmallVector pad; - pad.resize(2, 0); - getValuesFromIntArrayAttribute(op.pad(), pad); - pad.resize(pad.size() + 2, 0); - Attribute padAttr = rewriter.getZeroAttr(inElementTy); - Value paddedInput = applyPad(loc, input, pad, padAttr, rewriter); - - Attribute initialAttr = rewriter.getZeroAttr(accETy); - Value initialValue = rewriter.create(loc, initialAttr); - - SmallVector kernel, stride; - getValuesFromIntArrayAttribute(op.kernel(), kernel); - getValuesFromIntArrayAttribute(op.stride(), stride); - - Attribute strideAttr = rewriter.getI64VectorAttr(stride); - Attribute dilationAttr = rewriter.getI64VectorAttr({1, 1}); - - // Create the linalg op that performs pooling. - Value poolInitTensor = - rewriter.create(loc, accTy.getShape(), accETy); - - Value filledInitTensor = - rewriter.create(loc, initialValue, poolInitTensor) - .result(); - - Value fakeWindowDims = - rewriter.create(loc, kernel, accETy); - - // Sum across the pooled region. - Value poolingOp = rewriter - .create( - loc, ArrayRef{accTy}, - ValueRange{paddedInput, fakeWindowDims}, - filledInitTensor, strideAttr, dilationAttr) - .getResult(0); - - // Normalize the summed value by the number of elements grouped in each - // pool. - auto poolingOpTy = poolingOp.getType().cast(); - auto affineMap = rewriter.getMultiDimIdentityMap(resultTy.getRank()); - - Value genericInitTensor = rewriter.create( - loc, resultTy.getShape(), resultETy); - - auto genericOp = rewriter.create( - loc, ArrayRef({resultTy}), ValueRange{poolingOp}, - ValueRange{genericInitTensor}, - ArrayRef({affineMap, affineMap}), - getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &b, Location loc, ValueRange args) { - auto zero = rewriter.create(loc, 0); - auto one = rewriter.create(loc, 1); - auto iH = rewriter.create( - loc, poolingOpTy.getDimSize(1) - 1); - auto iW = rewriter.create( - loc, poolingOpTy.getDimSize(2) - 1); - - // Compute the indices from either end. 
- auto y0 = rewriter.create(loc, 1); - auto x0 = rewriter.create(loc, 2); - auto y1 = rewriter.create(loc, iH, y0); - auto x1 = rewriter.create(loc, iW, x0); - - // Determines what the portion of valid input is covered by the - // kernel. - auto padFn = [&](Value v, Value x, int64_t pad) -> Value { - if (pad == 0) - return v; - - auto padVal = rewriter.create(loc, pad); - Value dx = rewriter.create(loc, x, padVal); - - Value cmp = rewriter.create( - loc, arith::CmpIPredicate::slt, dx, zero); - Value offset = rewriter.create(loc, cmp, dx, zero); - return rewriter.create(loc, v, offset)->getResult(0); - }; - - // Compute the vertical component of coverage. - auto kH0 = rewriter.create(loc, kernel[0]); - auto kH1 = padFn(kH0, y0, pad[2]); - auto kH2 = padFn(kH1, y1, pad[3]); - auto kHCmp = rewriter.create( - loc, arith::CmpIPredicate::slt, kH2, one); - auto kH3 = rewriter.create(loc, kHCmp, one, kH2); - - // compute the horizontal component of coverage. - auto kW0 = rewriter.create(loc, kernel[1]); - auto kW1 = padFn(kW0, x0, pad[4]); - auto kW2 = padFn(kW1, x1, pad[5]); - auto kWCmp = rewriter.create( - loc, arith::CmpIPredicate::slt, kW2, one); - auto kW3 = rewriter.create(loc, kWCmp, one, kW2); - - // Compute the total number of elements and normalize. - Value count = rewriter.create(loc, kH3, kW3); - auto countI = rewriter.create( - loc, rewriter.getI32Type(), count); - - // Divide by the number of summed values. For floats this is just - // a div however for quantized values input normalization had - // to be applied. - Value poolVal = args[0]; - if (accETy.isa()) { - auto countF = rewriter.create(loc, accETy, countI); - poolVal = rewriter.create(loc, poolVal, countF) - ->getResult(0); - } else { - - // If we have quantization information we need to apply an offset - // for the input zp value. 
- if (op.quantization_info()) { - auto quantizationInfo = op.quantization_info().getValue(); - auto inputZp = rewriter.create( - loc, quantizationInfo.input_zp()); - Value offset = - rewriter.create(loc, accETy, countI, inputZp); - poolVal = - rewriter.create(loc, accETy, poolVal, offset); - } - - // Compute the multiplier and shift values for the quantization - // normalization. Preferably we would want to compute more bits - // however 32-bits should be enough for compute. Honestly we - // should probably straight divide. - int64_t numerator = ((1 << 30) + 1); - int64_t shift = 30; - - Value numeratorVal = rewriter.create( - loc, rewriter.getI32IntegerAttr(numerator)); - Value multiplierVal = - rewriter - .create(loc, rewriter.getI32Type(), - numeratorVal, countI) - .getResult(); - Value shiftVal = rewriter.create( - loc, rewriter.getI8IntegerAttr(shift)); - - auto scaled = - rewriter - .create( - loc, rewriter.getI32Type(), poolVal, multiplierVal, - shiftVal, rewriter.getBoolAttr(false)) - .getResult(); - - // If we have quantization information we need to apply output - // zeropoint. - if (op.quantization_info()) { - auto quantizationInfo = op.quantization_info().getValue(); - auto outputZp = rewriter.create( - loc, quantizationInfo.output_zp()); - scaled = rewriter.create(loc, scaled, outputZp) - .getResult(); - } - - // Apply Clip. - int64_t outBitwidth = resultETy.getIntOrFloatBitWidth(); - - auto min = rewriter.create( - loc, APInt::getSignedMinValue(outBitwidth).getSExtValue(), - accETy); - auto max = rewriter.create( - loc, APInt::getSignedMaxValue(outBitwidth).getSExtValue(), - accETy); - auto clamp = clampHelper( - loc, scaled, min, max, arith::CmpIPredicate::slt, rewriter); - - poolVal = clamp; - // Convert type. 
- if (resultETy != clamp.getType()) { - poolVal = - rewriter.create(loc, resultETy, poolVal); - } - } - - rewriter.create(loc, poolVal); - }); - - rewriter.replaceOp(op, genericOp.getResult(0)); - return success(); - } -}; - } // namespace void mlir::tosa::populateTosaToLinalgConversionPatterns( @@ -3132,8 +2326,6 @@ ReduceConverter, ArgMaxConverter, ConcatConverter, - ConvConverter, - DepthwiseConvConverter, GatherConverter, PadConverter, ReshapeConverterCollapse, @@ -3144,10 +2336,6 @@ ReverseConverter, TableConverter, TileConverter, - TransposeConverter, - MatMulConverter, - MaxPool2dConverter, - AvgPool2dConverter, - FullyConnectedConverter>(patterns->getContext()); + TransposeConverter>(patterns->getContext()); // clang-format on } diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp @@ -0,0 +1,885 @@ +//===- TosaToLinalgNamed.cpp - Lowering Tosa to Linalg Named Ops ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// These rewriters lower from the Tosa to the Linalg named ops. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h" +#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tosa/IR/TosaOps.h" +#include "mlir/Dialect/Utils/ReshapeOpsUtils.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include + +using namespace mlir; + +static SmallVector getNParallelLoopsAttrs(unsigned nParallelLoops) { + return SmallVector(nParallelLoops, getParallelIteratorTypeName()); +} + +template +static void getValuesFromIntArrayAttribute(ArrayAttr attr, + SmallVector &arrayValues) { + for (Attribute val : attr.getValue()) { + arrayValues.push_back(val.cast().getValue().getSExtValue()); + } +} + +template +static mlir::SelectOp clampHelper(Location loc, Value arg, + arith::ConstantOp min, arith::ConstantOp max, + P pred, OpBuilder &rewriter) { + auto smallerThanMin = rewriter.create(loc, pred, arg, min); + auto minOrArg = + rewriter.create(loc, smallerThanMin, min, arg); + auto largerThanMax = rewriter.create(loc, pred, max, arg); + return rewriter.create(loc, largerThanMax, max, minOrArg); +} + +static mlir::Value applyPad(Location loc, Value input, ArrayRef pad, + Attribute padAttr, OpBuilder &rewriter) { + // Input should be padded if necessary. 
+ if (llvm::all_of(pad, [](int64_t p) { return p == 0; })) + return input; + + ShapedType inputTy = input.getType().cast(); + Type inputETy = inputTy.getElementType(); + auto inputShape = inputTy.getShape(); + + assert((inputShape.size() * 2) == pad.size()); + + SmallVector paddedShape; + SmallVector lowIndices; + SmallVector highIndices; + for (int i = 0, s = inputShape.size(); i < s; i++) { + auto lowPad = pad[i * 2]; + auto highPad = pad[i * 2 + 1]; + paddedShape.push_back(inputShape[i] + highPad + lowPad); + lowIndices.push_back(rewriter.getIndexAttr(lowPad)); + highIndices.push_back(rewriter.getIndexAttr(highPad)); + } + + Value padValue = rewriter.create(loc, padAttr); + + return linalg::PadTensorOp::createPadScalarOp( + RankedTensorType::get(paddedShape, inputETy), input, padValue, + lowIndices, highIndices, /*nofold=*/false, loc, rewriter) + .result(); +} + +static SmallVector filterDynamicDims(SmallVector dynDims) { + SmallVector filteredDims; + for (auto dim : dynDims) + if (dim) + filteredDims.push_back(dim); + return filteredDims; +} + +namespace { + +class ConvConverter : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(tosa::Conv2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + Location loc = op->getLoc(); + Value input = op->getOperand(0); + Value weight = op->getOperand(1); + Value bias = op->getOperand(2); + + ShapedType inputTy = input.getType().cast(); + ShapedType weightTy = weight.getType().cast(); + ShapedType biasTy = bias.getType().cast(); + ShapedType resultTy = op->getResult(0).getType().cast(); + + Type inputETy = inputTy.getElementType(); + Type resultETy = resultTy.getElementType(); + + auto padAttr = op->getAttr("pad").cast(); + auto strideTosaAttr = op->getAttr("stride").cast(); + auto dilationTosaAttr = op->getAttr("dilation").cast(); + bool isQuantized = op->hasAttr("quantization_info"); + + if (!inputTy.hasStaticShape() || 
!weightTy.hasStaticShape() || + !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) + return rewriter.notifyMatchFailure(op, + "tosa.conv ops require static shapes"); + + if (inputETy.isUnsignedInteger()) + return rewriter.notifyMatchFailure( + op, "tosa.conv ops does not support unsigned integer input"); + + auto weightShape = weightTy.getShape(); + + // Apply padding as necessary. + Attribute zeroAttr = rewriter.getZeroAttr(inputETy); + if (isQuantized) { + auto quantizationInfo = + op->getAttr("quantization_info").cast(); + auto iZp = quantizationInfo.input_zp().getValue().getSExtValue(); + + int64_t intMin = + APInt::getSignedMinValue(inputETy.getIntOrFloatBitWidth()) + .getSExtValue(); + int64_t intMax = + APInt::getSignedMaxValue(inputETy.getIntOrFloatBitWidth()) + .getSExtValue(); + + if (iZp < intMin || iZp > intMax) + return rewriter.notifyMatchFailure( + op, "tosa.conv op quantization has zp outside of input range"); + + zeroAttr = rewriter.getIntegerAttr(inputETy, iZp); + } + + llvm::SmallVector pad; + pad.resize(2, 0); + getValuesFromIntArrayAttribute(padAttr, pad); + pad.resize(pad.size() + 2, 0); + input = applyPad(loc, input, pad, zeroAttr, rewriter); + + // Transpose the kernel to match dimension ordering of the linalg + // convolution operation. + // TODO(suderman): See if this can be efficiently folded - check whether + // the input is used anywhere else, if not fold the constant. 
+ SmallVector weightPerm{1, 2, 3, 0}; + SmallVector newWeightShape{weightShape[1], weightShape[2], + weightShape[3], weightShape[0]}; + auto weightPermAttr = DenseIntElementsAttr::get( + RankedTensorType::get({4}, rewriter.getI64Type()), weightPerm); + Value weightPermValue = + rewriter.create(loc, weightPermAttr); + Type newWeightTy = + RankedTensorType::get(newWeightShape, weightTy.getElementType()); + weight = rewriter.create(loc, newWeightTy, weight, + weightPermValue); + + Attribute resultZeroAttr = rewriter.getZeroAttr(resultETy); + Value initTensor = rewriter.create( + loc, resultTy.getShape(), resultETy); + Value zero = rewriter.create(loc, resultZeroAttr); + Value zeroTensor = + rewriter.create(loc, zero, initTensor).getResult(0); + + // Extract the attributes for convolution. + llvm::SmallVector stride, dilation; + getValuesFromIntArrayAttribute(strideTosaAttr, stride); + getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); + + // Create the convolution op. + auto strideAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), stride); + auto dilationAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), dilation); + + // Create maps for the bias broadcasting + SmallVector indexingMaps; + indexingMaps.push_back(AffineMap::get( + /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, + {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + + Value biasInitTensor = rewriter.create( + loc, resultTy.getShape(), resultETy); + + if (isQuantized) { + auto quantizationInfo = + op->getAttr("quantization_info").cast(); + auto iZp = rewriter.getI32IntegerAttr( + quantizationInfo.input_zp().getValue().getSExtValue()); + auto kZp = rewriter.getI32IntegerAttr( + quantizationInfo.weight_zp().getValue().getSExtValue()); + + auto iZpVal = 
rewriter.create(loc, iZp); + auto kZpVal = rewriter.create(loc, kZp); + Value conv = + rewriter + .create( + loc, resultTy, ValueRange{input, weight, iZpVal, kZpVal}, + ValueRange{zeroTensor}, strideAttr, dilationAttr) + ->getResult(0); + + Value result = + rewriter + .create( + loc, resultTy, ValueRange({bias, conv}), biasInitTensor, + indexingMaps, getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + rewriter.replaceOp(op, result); + return success(); + } + + Value conv = rewriter + .create( + loc, resultTy, ValueRange{input, weight}, + ValueRange{zeroTensor}, strideAttr, dilationAttr) + ->getResult(0); + + Value result = + rewriter + .create( + loc, resultTy, ValueRange({bias, conv}), biasInitTensor, + indexingMaps, getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + + rewriter.replaceOp(op, result); + return success(); + } +}; + +class DepthwiseConvConverter + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(tosa::DepthwiseConv2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + Location loc = op->getLoc(); + Value input = op->getOperand(0); + Value weight = op->getOperand(1); + Value bias = op->getOperand(2); + + ShapedType inputTy = input.getType().cast(); + ShapedType weightTy = weight.getType().cast(); + ShapedType biasTy = bias.getType().cast(); + ShapedType resultTy = op->getResult(0).getType().cast(); + + Type inputETy = inputTy.getElementType(); + Type resultETy = resultTy.getElementType(); + + auto padAttr = op->getAttr("pad").cast(); + auto strideTosaAttr = 
op->getAttr("stride").cast(); + auto dilationTosaAttr = op->getAttr("dilation").cast(); + + bool isQuantized = op->hasAttr("quantization_info"); + IntegerAttr iZp; + IntegerAttr kZp; + if (isQuantized) { + auto quantizationInfo = + op->getAttr("quantization_info").cast(); + iZp = rewriter.getI32IntegerAttr( + quantizationInfo.input_zp().getValue().getSExtValue()); + kZp = rewriter.getI32IntegerAttr( + quantizationInfo.weight_zp().getValue().getSExtValue()); + } + + if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() || + !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) + return rewriter.notifyMatchFailure(op, + "tosa.conv ops require static shapes"); + + auto weightShape = weightTy.getShape(); + auto resultShape = resultTy.getShape(); + + // Apply padding as necessary. + Attribute zeroAttr = rewriter.getZeroAttr(inputETy); + if (isQuantized) { + auto quantizationInfo = + op->getAttr("quantization_info").cast(); + auto iZp = quantizationInfo.input_zp().getValue().getSExtValue(); + + int64_t intMin = + APInt::getSignedMinValue(inputETy.getIntOrFloatBitWidth()) + .getSExtValue(); + int64_t intMax = + APInt::getSignedMaxValue(inputETy.getIntOrFloatBitWidth()) + .getSExtValue(); + + if (iZp < intMin || iZp > intMax) + return rewriter.notifyMatchFailure( + op, "tosa.depthwise_conv op quantization has zp outside of input " + "range"); + + zeroAttr = rewriter.getIntegerAttr(inputETy, iZp); + } + + llvm::SmallVector pad; + pad.resize(2, 0); + getValuesFromIntArrayAttribute(padAttr, pad); + pad.resize(pad.size() + 2, 0); + + input = applyPad(loc, input, pad, zeroAttr, rewriter); + + // Extract the attributes for convolution. + llvm::SmallVector stride, dilation; + getValuesFromIntArrayAttribute(strideTosaAttr, stride); + getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); + + // Create the convolution op. 
+ auto strideAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), stride); + auto dilationAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), dilation); + ShapedType linalgConvTy = + RankedTensorType::get({resultShape[0], resultShape[1], resultShape[2], + weightShape[2], weightShape[3]}, + resultETy); + + // Broadcast the initial value to the output tensor before convolving. + SmallVector indexingMaps; + indexingMaps.push_back(AffineMap::get( + /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, + {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + + Attribute resultZeroAttr = rewriter.getZeroAttr(resultETy); + Value initTensor = rewriter.create( + loc, linalgConvTy.getShape(), resultETy); + Value zero = rewriter.create(loc, resultZeroAttr); + Value zeroTensor = + rewriter.create(loc, zero, initTensor).getResult(0); + + Value biasInitTensor = rewriter.create( + loc, resultTy.getShape(), resultETy); + if (!isQuantized) { + Value conv = rewriter + .create( + loc, linalgConvTy, ValueRange{input, weight}, + ValueRange{zeroTensor}, strideAttr, dilationAttr) + .getResult(0); + Value convReshape = rewriter.create( + loc, resultTy, conv, rewriter.getI64ArrayAttr(resultTy.getShape())); + Value result = + rewriter + .create( + loc, resultTy, ValueRange({bias, convReshape}), + biasInitTensor, indexingMaps, + getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + rewriter.replaceOp(op, result); + } else { + auto iZpVal = rewriter.create(loc, iZp); + auto kZpVal = rewriter.create(loc, kZp); + Value conv = + rewriter + .create( + loc, linalgConvTy, 
ValueRange{input, weight, iZpVal, kZpVal}, + ValueRange{zeroTensor}, strideAttr, dilationAttr) + .getResult(0); + Value convReshape = rewriter.create( + loc, resultTy, conv, rewriter.getI64ArrayAttr(resultTy.getShape())); + Value result = + rewriter + .create( + loc, resultTy, ValueRange({bias, convReshape}), + biasInitTensor, indexingMaps, + getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + rewriter.replaceOp(op, result); + } + return success(); + } +}; + +class MatMulConverter : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(tosa::MatMulOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + Location loc = op.getLoc(); + + auto outputTy = op.getType().cast(); + auto outputElementTy = outputTy.getElementType(); + + auto firstOperandTy = op->getOperand(0).getType().cast(); + auto secondOperandTy = op->getOperand(1).getType().cast(); + + SmallVector dynDims; + dynDims.resize(op->getResult(0).getType().cast().getRank()); + + if (!firstOperandTy.hasRank() || firstOperandTy.isDynamicDim(0)) { + dynDims[0] = rewriter.create(loc, op->getOperand(0), 0); + } + + if (!firstOperandTy.hasRank() || firstOperandTy.isDynamicDim(1)) { + dynDims[1] = rewriter.create(loc, op->getOperand(0), 1); + } + + if (!secondOperandTy.hasRank() || secondOperandTy.isDynamicDim(2)) { + dynDims[2] = rewriter.create(loc, op->getOperand(1), 2); + } + + SmallVector filteredDims = filterDynamicDims(dynDims); + + auto zeroAttr = rewriter.getZeroAttr(outputElementTy); + Value zero = rewriter.create(loc, zeroAttr); + auto initTensor = rewriter.create( + loc, filteredDims, outputTy.getShape(), outputTy.getElementType()); + Value zeroTensor = + rewriter.create(loc, zero, initTensor).getResult(0); + if 
(!op.quantization_info()) { + rewriter.replaceOpWithNewOp( + op, TypeRange{op.getType()}, ValueRange{adaptor.a(), adaptor.b()}, + ValueRange{zeroTensor}); + return success(); + } + + auto quantizationInfo = op.quantization_info().getValue(); + auto aZp = rewriter.create( + loc, rewriter.getI32IntegerAttr( + quantizationInfo.a_zp().getValue().getSExtValue())); + auto bZp = rewriter.create( + loc, rewriter.getI32IntegerAttr( + quantizationInfo.b_zp().getValue().getSExtValue())); + rewriter.replaceOpWithNewOp( + op, TypeRange{op.getType()}, + ValueRange{adaptor.a(), adaptor.b(), aZp, bZp}, zeroTensor); + + return success(); + } +}; + +class FullyConnectedConverter + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(tosa::FullyConnectedOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + Location loc = op.getLoc(); + auto outputTy = op.getType().cast(); + auto input = op.input(); + auto inputTy = input.getType().cast(); + + auto bias = op.bias(); + + auto weight = op.weight(); + auto weightTy = weight.getType().cast(); + auto weightShape = weightTy.getShape(); + + auto outputETy = outputTy.getElementType(); + + SmallVector dynDims; + dynDims.resize(op->getResult(0).getType().cast().getRank()); + + if (!inputTy.hasRank() || inputTy.isDynamicDim(0)) { + dynDims[0] = rewriter.create(loc, input, 0); + } + + if (!weightTy.hasRank() || weightTy.isDynamicDim(0)) { + dynDims[1] = rewriter.create(loc, weight, 0); + } + + SmallVector filteredDims = filterDynamicDims(dynDims); + + // Creating maps for the output of MatMul and the bias + SmallVector indexingMaps; + + // Broadcast the bias. 
+ indexingMaps.push_back(AffineMap::get(/*dimCount=*/2, /*symbolCount=*/0, + {rewriter.getAffineDimExpr(1)}, + rewriter.getContext())); + + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(outputTy.getRank())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(outputTy.getRank())); + + auto initTensor = rewriter.create( + loc, filteredDims, outputTy.getShape(), outputTy.getElementType()); + + // When quantized, the input element type is not the same as the output + Attribute resultZeroAttr = rewriter.getZeroAttr(outputETy); + Value zero = rewriter.create(loc, resultZeroAttr); + Value zeroTensor = + rewriter.create(loc, zero, initTensor).getResult(0); + + SmallVector permutation{1, 0}; + auto permutationAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), permutation); + Value permutationValue = + rewriter.create(loc, permutationAttr); + + SmallVector newWeightShape{weightShape[1], weightShape[0]}; + Type newWeightTy = + RankedTensorType::get(newWeightShape, weightTy.getElementType()); + + Value transposedWeight = rewriter.create( + loc, newWeightTy, weight, permutationValue); + + auto biasInitTensor = + rewriter + .create(loc, filteredDims, + outputTy.getShape(), outputETy) + ->getResults(); + + if (!op.quantization_info()) { + Value matmul = rewriter + .create( + loc, TypeRange{op.getType()}, + ValueRange{input, transposedWeight}, zeroTensor) + ->getResult(0); + + Value result = + rewriter + .create( + loc, outputTy, ValueRange({bias, matmul}), biasInitTensor, + indexingMaps, getNParallelLoopsAttrs(outputTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + rewriter.replaceOp(op, result); + return success(); + } + + auto quantizationInfo = op.quantization_info().getValue(); + auto inputZp = rewriter.create( + loc, rewriter.getI32IntegerAttr( +
quantizationInfo.input_zp().getValue().getSExtValue())); + auto outputZp = rewriter.create( + loc, rewriter.getI32IntegerAttr( + quantizationInfo.weight_zp().getValue().getSExtValue())); + Value matmul = + rewriter + .create( + loc, TypeRange{op.getType()}, + ValueRange{input, transposedWeight, inputZp, outputZp}, + zeroTensor) + ->getResult(0); + Value result = + rewriter + .create( + loc, outputTy, ValueRange({bias, matmul}), biasInitTensor, + indexingMaps, getNParallelLoopsAttrs(outputTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + rewriter.replaceOp(op, result); + return success(); + } +}; + +class MaxPool2dConverter : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::MaxPool2dOp op, + PatternRewriter &rewriter) const final { + Location loc = op.getLoc(); + Value input = op.input(); + ShapedType inputTy = input.getType().cast(); + + ShapedType resultTy = op.getType().template cast(); + Type resultETy = inputTy.getElementType(); + + if (!inputTy.hasStaticShape()) + return failure(); + + // Determine what the initial value needs to be for the max pool op. + Attribute initialAttr; + if (resultETy.isF32()) + initialAttr = rewriter.getFloatAttr( + resultETy, + APFloat::getLargest(resultETy.cast().getFloatSemantics(), + true)); + + if (resultETy.isa()) + initialAttr = rewriter.getIntegerAttr( + resultETy, + APInt::getSignedMinValue(resultETy.getIntOrFloatBitWidth())); + + if (!initialAttr) + return rewriter.notifyMatchFailure( + op, "Unsupported initial value for tosa.maxpool_2d op"); + + // Apply padding as necessary. 
+ llvm::SmallVector pad; + pad.resize(2, 0); + getValuesFromIntArrayAttribute(op.pad(), pad); + pad.resize(pad.size() + 2, 0); + Value paddedInput = applyPad(loc, input, pad, initialAttr, rewriter); + + Value initialValue = rewriter.create(loc, initialAttr); + + SmallVector kernel, stride; + getValuesFromIntArrayAttribute(op.kernel(), kernel); + getValuesFromIntArrayAttribute(op.stride(), stride); + + Attribute strideAttr = rewriter.getI64VectorAttr(stride); + Attribute dilationAttr = rewriter.getI64VectorAttr({1, 1}); + + // Create the linalg op that performs pooling. + Value initTensor = rewriter.create( + loc, resultTy.getShape(), resultTy.getElementType()); + + Value filledInitTensor = + rewriter.create(loc, initialValue, initTensor).result(); + + Value fakeWindowDims = + rewriter.create(loc, kernel, resultETy); + + rewriter.replaceOpWithNewOp( + op, ArrayRef{resultTy}, ValueRange{paddedInput, fakeWindowDims}, + filledInitTensor, strideAttr, dilationAttr); + return success(); + } +}; + +class AvgPool2dConverter : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::AvgPool2dOp op, + PatternRewriter &rewriter) const final { + Location loc = op.getLoc(); + Value input = op.input(); + ShapedType inputTy = input.getType().cast(); + Type inElementTy = inputTy.getElementType(); + + ShapedType resultTy = op.getType().template cast(); + Type resultETy = op.getType().cast().getElementType(); + + Type accETy = + inElementTy.isa() ? rewriter.getI32Type() : inElementTy; + ShapedType accTy = resultTy.clone(accETy); + + if (!inputTy.hasStaticShape()) + return failure(); + + // Apply padding as necessary. 
+ llvm::SmallVector pad; + pad.resize(2, 0); + getValuesFromIntArrayAttribute(op.pad(), pad); + pad.resize(pad.size() + 2, 0); + Attribute padAttr = rewriter.getZeroAttr(inElementTy); + Value paddedInput = applyPad(loc, input, pad, padAttr, rewriter); + + Attribute initialAttr = rewriter.getZeroAttr(accETy); + Value initialValue = rewriter.create(loc, initialAttr); + + SmallVector kernel, stride; + getValuesFromIntArrayAttribute(op.kernel(), kernel); + getValuesFromIntArrayAttribute(op.stride(), stride); + + Attribute strideAttr = rewriter.getI64VectorAttr(stride); + Attribute dilationAttr = rewriter.getI64VectorAttr({1, 1}); + + // Create the linalg op that performs pooling. + Value poolInitTensor = + rewriter.create(loc, accTy.getShape(), accETy); + + Value filledInitTensor = + rewriter.create(loc, initialValue, poolInitTensor) + .result(); + + Value fakeWindowDims = + rewriter.create(loc, kernel, accETy); + + // Sum across the pooled region. + Value poolingOp = rewriter + .create( + loc, ArrayRef{accTy}, + ValueRange{paddedInput, fakeWindowDims}, + filledInitTensor, strideAttr, dilationAttr) + .getResult(0); + + // Normalize the summed value by the number of elements grouped in each + // pool. + auto poolingOpTy = poolingOp.getType().cast(); + auto affineMap = rewriter.getMultiDimIdentityMap(resultTy.getRank()); + + Value genericInitTensor = rewriter.create( + loc, resultTy.getShape(), resultETy); + + auto genericOp = rewriter.create( + loc, ArrayRef({resultTy}), ValueRange{poolingOp}, + ValueRange{genericInitTensor}, + ArrayRef({affineMap, affineMap}), + getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &b, Location loc, ValueRange args) { + auto zero = rewriter.create(loc, 0); + auto one = rewriter.create(loc, 1); + auto iH = rewriter.create( + loc, poolingOpTy.getDimSize(1) - 1); + auto iW = rewriter.create( + loc, poolingOpTy.getDimSize(2) - 1); + + // Compute the indices from either end. 
+ auto y0 = rewriter.create(loc, 1); + auto x0 = rewriter.create(loc, 2); + auto y1 = rewriter.create(loc, iH, y0); + auto x1 = rewriter.create(loc, iW, x0); + + // Determine what portion of the valid input is covered by the + // kernel. + auto padFn = [&](Value v, Value x, int64_t pad) -> Value { + if (pad == 0) + return v; + + auto padVal = rewriter.create(loc, pad); + Value dx = rewriter.create(loc, x, padVal); + + Value cmp = rewriter.create( + loc, arith::CmpIPredicate::slt, dx, zero); + Value offset = rewriter.create(loc, cmp, dx, zero); + return rewriter.create(loc, v, offset)->getResult(0); + }; + + // Compute the vertical component of coverage. + auto kH0 = rewriter.create(loc, kernel[0]); + auto kH1 = padFn(kH0, y0, pad[2]); + auto kH2 = padFn(kH1, y1, pad[3]); + auto kHCmp = rewriter.create( + loc, arith::CmpIPredicate::slt, kH2, one); + auto kH3 = rewriter.create(loc, kHCmp, one, kH2); + + // Compute the horizontal component of coverage. + auto kW0 = rewriter.create(loc, kernel[1]); + auto kW1 = padFn(kW0, x0, pad[4]); + auto kW2 = padFn(kW1, x1, pad[5]); + auto kWCmp = rewriter.create( + loc, arith::CmpIPredicate::slt, kW2, one); + auto kW3 = rewriter.create(loc, kWCmp, one, kW2); + + // Compute the total number of elements and normalize. + Value count = rewriter.create(loc, kH3, kW3); + auto countI = rewriter.create( + loc, rewriter.getI32Type(), count); + + // Divide by the number of summed values. For floats this is just + // a div; however, for quantized values input normalization has + // to be applied. + Value poolVal = args[0]; + if (accETy.isa()) { + auto countF = rewriter.create(loc, accETy, countI); + poolVal = rewriter.create(loc, poolVal, countF) + ->getResult(0); + } else { + + // If we have quantization information we need to apply an offset + // for the input zp value. +
+ if (op.quantization_info()) { + auto quantizationInfo = op.quantization_info().getValue(); + auto inputZp = rewriter.create( + loc, quantizationInfo.input_zp()); + Value offset = + rewriter.create(loc, accETy, countI, inputZp); + poolVal = + rewriter.create(loc, accETy, poolVal, offset); + } + + // Compute the multiplier and shift values for the quantization + // normalization. Preferably we would want to compute more bits + // however 32-bits should be enough for compute. Honestly we + // should probably straight divide. + int64_t numerator = ((1 << 30) + 1); + int64_t shift = 30; + + Value numeratorVal = rewriter.create( + loc, rewriter.getI32IntegerAttr(numerator)); + Value multiplierVal = + rewriter + .create(loc, rewriter.getI32Type(), + numeratorVal, countI) + .getResult(); + Value shiftVal = rewriter.create( + loc, rewriter.getI8IntegerAttr(shift)); + + auto scaled = + rewriter + .create( + loc, rewriter.getI32Type(), poolVal, multiplierVal, + shiftVal, rewriter.getBoolAttr(false)) + .getResult(); + + // If we have quantization information we need to apply output + // zeropoint. + if (op.quantization_info()) { + auto quantizationInfo = op.quantization_info().getValue(); + auto outputZp = rewriter.create( + loc, quantizationInfo.output_zp()); + scaled = rewriter.create(loc, scaled, outputZp) + .getResult(); + } + + // Apply Clip. + int64_t outBitwidth = resultETy.getIntOrFloatBitWidth(); + + auto min = rewriter.create( + loc, APInt::getSignedMinValue(outBitwidth).getSExtValue(), + accETy); + auto max = rewriter.create( + loc, APInt::getSignedMaxValue(outBitwidth).getSExtValue(), + accETy); + auto clamp = clampHelper( + loc, scaled, min, max, arith::CmpIPredicate::slt, rewriter); + + poolVal = clamp; + // Convert type. 
+ if (resultETy != clamp.getType()) { + poolVal = + rewriter.create(loc, resultETy, poolVal); + } + } + + rewriter.create(loc, poolVal); + }); + + rewriter.replaceOp(op, genericOp.getResult(0)); + return success(); + } +}; + +} // namespace + +void mlir::tosa::populateTosaToLinalgNamedConversionPatterns( + RewritePatternSet *patterns) { + patterns->add< + // clang-format off + ConvConverter, + DepthwiseConvConverter, + MatMulConverter, + MaxPool2dConverter, + AvgPool2dConverter, + FullyConnectedConverter>(patterns->getContext()); + // clang-format on +} diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp copy from mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp copy to mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp @@ -30,7 +30,7 @@ using namespace mlir; namespace { -struct TosaToLinalg : public TosaToLinalgBase { +struct TosaToLinalgNamed : public TosaToLinalgNamedBase { public: void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); - target.addIllegalDialect(); + tosa::TosaDialect, tensor::TensorDialect, + scf::SCFDialect>(); // Not every TOSA op can be legalized to linalg. 
- target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); FuncOp func = getFunction(); - mlir::tosa::populateTosaToLinalgConversionPatterns(&patterns); + mlir::tosa::populateTosaToLinalgNamedConversionPatterns(&patterns); if (failed(applyFullConversion(func, target, std::move(patterns)))) signalPassFailure(); } }; } // namespace -std::unique_ptr mlir::tosa::createTosaToLinalg() { - return std::make_unique(); -} - -void mlir::tosa::addTosaToLinalgPasses(OpPassManager &pm) { - pm.addNestedPass(createTosaMakeBroadcastablePass()); - pm.addNestedPass(createTosaToLinalg()); +std::unique_ptr mlir::tosa::createTosaToLinalgNamed() { + return std::make_unique(); } diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp @@ -26,6 +26,7 @@ #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/Passes.h" using namespace mlir; @@ -67,6 +68,9 @@ } void mlir::tosa::addTosaToLinalgPasses(OpPassManager &pm) { + pm.addNestedPass(createTosaMakeBroadcastablePass()); + pm.addNestedPass(createTosaToLinalgNamed()); + pm.addNestedPass(mlir::createCanonicalizerPass()); pm.addNestedPass(createTosaMakeBroadcastablePass()); pm.addNestedPass(createTosaToLinalg()); } diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -0,0 +1,448 @@ +// RUN: 
mlir-opt --split-input-file --tosa-to-linalg-named %s -verify-diagnostics -o -| FileCheck %s + +// CHECK-LABEL: @matmul +func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) { + // CHECK: [[C0:%.+]] = arith.constant 0 + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 6] + // CHECK: [[FILLED:%.+]] = linalg.fill([[C0]], [[INIT]]) : f32, tensor<1x5x6xf32> -> tensor<1x5x6xf32> + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x6xf32>) outs([[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> + %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) + return %0 : tensor<1x5x6xf32> +} + +// ----- + + +// CHECK-LABEL: @matmul_quantized +func @matmul_quantized(%arg0: tensor<1x5x3xi8>, %arg1: tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) { + // CHECK: [[C0:%.+]] = arith.constant 0 + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 6] + // CHECK: [[FILLED:%.+]] = linalg.fill([[C0]], [[INIT]]) : i32, tensor<1x5x6xi32> -> tensor<1x5x6xi32> + // CHECK: [[ONE:%.+]] = arith.constant 1 + // CHECK: [[TWO:%.+]] = arith.constant 2 + // CHECK: linalg.quantized_batch_matmul ins(%arg0, %arg1, [[ONE]], [[TWO]] : tensor<1x5x3xi8>, tensor<1x3x6xi8>, i32, i32) outs([[FILLED]] : tensor<1x5x6xi32>) -> tensor<1x5x6xi32> + %0 = "tosa.matmul"(%arg0, %arg1) {quantization_info = {a_zp = 1 : i32, b_zp = 2 : i32}} : (tensor<1x5x3xi8>, tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) + return %0 : tensor<1x5x6xi32> +} + +// ----- + +// CHECK-LABEL: @matmul_dyn_batch +func @matmul_dyn_batch(%arg0: tensor, %arg1: tensor) -> (tensor) { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]] + // CHECK: %[[C0_0:.+]] = arith.constant 0 + // CHECK: %[[INIT:.+]] = linalg.init_tensor [%[[DIM]], 5, 6] + // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0_0]], %[[INIT]]) : f32, tensor -> tensor + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor, tensor) outs(%[[FILLED]] : 
tensor) -> tensor + %0 = "tosa.matmul"(%arg0, %arg1) : (tensor, tensor) -> (tensor) + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: @matmul_dyn_independent_dim +func @matmul_dyn_independent_dim(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x?xf32>) -> (tensor<1x5x?xf32>) { + // CHECK: %[[C2:.+]] = arith.constant 2 + // CHECK: %[[DIM:.+]] = tensor.dim %arg1, %[[C2]] + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 5, %[[DIM]]] + // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0]], %[[INIT]]) : f32, tensor<1x5x?xf32> -> tensor<1x5x?xf32> + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x?xf32>) outs(%[[FILLED]] : tensor<1x5x?xf32>) -> tensor<1x5x?xf32> + %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x?xf32>) -> (tensor<1x5x?xf32>) + return %0 : tensor<1x5x?xf32> +} + +// ----- + +// CHECK-LABEL: @matmul_dyn_independent_dim +func @matmul_dyn_independent_dim(%arg0: tensor<1x5x?xf32>, %arg1: tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 5, 6] + // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0]], %[[INIT]]) : f32, tensor<1x5x6xf32> -> tensor<1x5x6xf32> + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x?xf32>, tensor<1x?x6xf32>) outs(%[[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> + %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x?xf32>, tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) + return %0 : tensor<1x5x6xf32> +} + +// ----- + +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)> +// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)> + +// CHECK-LABEL: @fully_connected +func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<5x6xf32>) { + // CHECK: [[INITT:%.+]] = linalg.init_tensor [5, 6] + // CHECK: [[ZERO:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[ZERO]], [[INITT]]) + // CHECK: [[PERM:%.+]] = 
arith.constant dense<[1, 0]> + // CHECK: [[TRANSPOSE:%.+]] = "tosa.transpose"(%arg1, [[PERM]]) + // CHECK: [[INITB:%.+]] = linalg.init_tensor [5, 6] + // CHECK: [[MATMUL:%.+]] = linalg.matmul ins(%arg0, [[TRANSPOSE]] : tensor<5x3xf32>, tensor<3x6xf32>) outs([[FILL]] : tensor<5x6xf32>) -> tensor<5x6xf32> + // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xf32>, tensor<5x6xf32>) outs([[INITB]] : tensor<5x6xf32>) { + // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): + // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 + // CHECK: linalg.yield [[ADD]] : f32 + + %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) : (tensor<5x3xf32>, tensor<6x3xf32>, tensor<6xf32>) -> (tensor<5x6xf32>) + return %0 : tensor<5x6xf32> +} + +// ----- + +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)> +// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)> + +// CHECK-LABEL: @quantized_fully_connected +func @quantized_fully_connected(%arg0: tensor<5x3xi8>, %arg1: tensor<6x3xi8>, %arg2: tensor<6xi32>) -> (tensor<5x6xi32>) { + // CHECK: [[INITT:%.+]] = linalg.init_tensor [5, 6] + // CHECK: [[ZERO:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[ZERO]], [[INITT]]) + // CHECK: [[PERM:%.+]] = arith.constant dense<[1, 0]> + // CHECK: [[TRANSPOSE:%.+]] = "tosa.transpose"(%arg1, [[PERM]]) + // CHECK: [[INITB:%.+]] = linalg.init_tensor [5, 6] + // CHECK: [[ONE:%.+]] = arith.constant 1 + // CHECK: [[TWO:%.+]] = arith.constant 2 + // CHECK: [[MATMUL:%.+]] = linalg.quantized_matmul ins(%arg0, [[TRANSPOSE]], [[ONE]], [[TWO]] : tensor<5x3xi8>, tensor<3x6xi8>, i32, i32) outs([[FILL]] : tensor<5x6xi32>) -> tensor<5x6xi32> + // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xi32>, tensor<5x6xi32>) outs([[INITB]] + // CHECK: ^bb0([[IN1:%.+]]: 
i32, [[IN2:%.+]]: i32, [[UNUSED:%.+]]: i32): + // CHECK: [[ADD:%.+]] = arith.addi + // CHECK: linalg.yield [[ADD]] : i32 + %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) {quantization_info = {input_zp = 1:i32, weight_zp = 2:i32}} : (tensor<5x3xi8>, tensor<6x3xi8>, tensor<6xi32>) -> (tensor<5x6xi32>) + return %0 : tensor<5x6xi32> +} + +// ----- + +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)> +// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)> + +// CHECK-LABEL: @fully_connected_dyn +func @fully_connected_dyn(%arg0: tensor, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor) { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]] + // CHECK: %[[INITT:.+]] = linalg.init_tensor [%[[DIM]], 6] + // CHECK: %[[ZERO:.+]] = arith.constant 0 + // CHECK: %[[FILL:.+]] = linalg.fill(%[[ZERO]], %[[INITT]]) + // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> + // CHECK: %[[TRANSPOSE:.+]] = "tosa.transpose"(%arg1, %[[PERM]]) + // CHECK: %[[INITB:.+]] = linalg.init_tensor [%[[DIM]], 6] + // CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%arg0, %[[TRANSPOSE]] : tensor, tensor<3x6xf32>) outs(%[[FILL]] : tensor) -> tensor + // CHECK: %[[ADDED:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, %[[MATMUL]] : tensor<6xf32>, tensor) outs(%[[INITB]] : tensor) { + // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): + // CHECK: %[[ADD:.+]] = arith.addf %arg3, %arg4 : f32 + // CHECK: linalg.yield %[[ADD]] : f32 + + %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) : (tensor, tensor<6x3xf32>, tensor<6xf32>) -> (tensor) + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: @max_pool +func @max_pool(%arg0: tensor<1x6x34x62xf32>) -> () { + // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38 + // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 32, 62] + // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[CONST]], [[INIT]]) + // CHECK-DAG: 
[[KERNEL:%.+]] = linalg.init_tensor [3, 3] + // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, [[KERNEL]] : tensor<1x6x34x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x32x62xf32>) + %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x32x62xf32>) + return +} + +// CHECK-LABEL: @max_pool_padded +func @max_pool_padded(%arg0: tensor<1x6x34x62xf32>) -> () { + // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38 : f32 + // CHECK-DAG: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 0, 0, 0] high[0, 0, 1, 0] + // CHECK-DAG: linalg.yield [[CONST]] + // CHECK-DAG: [[INITVAL:%.+]] = arith.constant -3.40282347E+38 : f32 + // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 33, 62] + // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[INITVAL]], [[INIT]]) + // CHECK-DAG: [[KERNEL:%.+]] = linalg.init_tensor [3, 3] + // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x6x35x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x33x62xf32>) + %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 1], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x33x62xf32>) + return +} + +// CHECK-LABEL: @max_pool_i8 +func @max_pool_i8(%arg0: tensor<1x6x34x62xi8>) -> () { + // CHECK: arith.constant -128 + // CHECK: linalg.pooling_nhwc_max + %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi8>) -> (tensor<1x4x32x62xi8>) + return +} + +// CHECK-LABEL: @max_pool_i16 +func @max_pool_i16(%arg0: tensor<1x6x34x62xi16>) -> () { + // CHECK: arith.constant -32768 + // CHECK: linalg.pooling_nhwc_max + %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi16>) -> (tensor<1x4x32x62xi16>) + return +} + +// CHECK-LABEL: @max_pool_i32 +func 
@max_pool_i32(%arg0: tensor<1x6x34x62xi32>) -> () { + // CHECK: arith.constant -2147483648 + // CHECK: linalg.pooling_nhwc_max + %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi32>) -> (tensor<1x4x32x62xi32>) + return +} +// ----- + +// CHECK-LABEL: @avg_pool +func @avg_pool(%arg0: tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) { + // Initial piece computes the sum of the pooling region, with appropriate padding. + // CHECK: [[CONST:%.+]] = arith.constant 0 + // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: [[CONST:%.+]] = arith.constant 0 + // CHECK: [[POOLINIT:%.+]] = linalg.init_tensor [1, 5, 33, 62] + // CHECK: [[FILL:%.+]] = linalg.fill([[CONST]], [[POOLINIT]]) + // CHECK: [[KERNEL:%.+]] = linalg.init_tensor [4, 4] + // CHECK: [[POOL:%.+]] = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x8x36x62xf32>, tensor<4x4xf32>) outs([[FILL]] : tensor<1x5x33x62xf32>) + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 33, 62] + // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[POOL]] : tensor<1x5x33x62xf32>) outs([[INIT]] : tensor<1x5x33x62xf32>) + // CHECK: [[ZERO:%.+]] = arith.constant 0 + // CHECK: [[ONE:%.+]] = arith.constant 1 + // CHECK: [[HEIGHT:%.+]] = arith.constant 4 + // CHECK: [[WIDTH:%.+]] = arith.constant 32 + // CHECK: [[IDX1:%.+]] = linalg.index 1 + // CHECK: [[IDX2:%.+]] = linalg.index 2 + + // The large block below computes what portion of the kernel is within non-padded input.
+ // CHECK: [[NY:%.+]] = arith.subi [[HEIGHT]], [[IDX1]] + // CHECK: [[NX:%.+]] = arith.subi [[WIDTH]], [[IDX2]] + // CHECK: [[KH:%.+]] = arith.constant 4 + // CHECK: [[PAD0:%.+]] = arith.constant 1 + // CHECK: [[SUBP0:%.+]] = arith.subi [[IDX1]], [[PAD0]] + // CHECK: [[P0CMP:%.+]] = arith.cmpi slt, [[SUBP0]], [[ZERO]] + // CHECK: [[SELP0:%.+]] = select [[P0CMP]], [[SUBP0]], [[ZERO]] + // CHECK: [[ADDP0:%.+]] = arith.addi [[KH]], [[SELP0]] + // CHECK: [[PAD1:%.+]] = arith.constant 1 + // CHECK: [[SUBP1:%.+]] = arith.subi [[NY]], [[PAD1]] + // CHECK: [[P1CMP:%.+]] = arith.cmpi slt, [[SUBP1]], [[ZERO]] + // CHECK: [[SELP1:%.+]] = select [[P1CMP]], [[SUBP1]], [[ZERO]] + // CHECK: [[ADDP1:%.+]] = arith.addi [[ADDP0]], [[SELP1]] + // CHECK: [[YCMP:%.+]] = arith.cmpi slt, [[ADDP1]], [[ONE]] + // CHECK: [[YSEL:%.+]] = select [[YCMP]], [[ONE]], [[ADDP1]] + // CHECK: [[KW:%.+]] = arith.constant 4 : index + // CHECK: [[PAD2:%.+]] = arith.constant 1 : index + // CHECK: [[SUBP2:%.+]] = arith.subi [[IDX2]], [[PAD2]] + // CHECK: [[P2CMP:%.+]] = arith.cmpi slt, [[SUBP2]], [[ZERO]] + // CHECK: [[SELP2:%.+]] = select [[P2CMP]], [[SUBP2]], [[ZERO]] + // CHECK: [[ADDP2:%.+]] = arith.addi [[KW]], [[SELP2]] + // CHECK: [[PAD3:%.+]] = arith.constant 1 : index + // CHECK: [[SUBP3:%.+]] = arith.subi [[NX]], [[PAD3]] + // CHECK: [[P3CMP:%.+]] = arith.cmpi slt, [[SUBP3]], [[ZERO]] + // CHECK: [[SELP3:%.+]] = select [[P3CMP]], [[SUBP3]], [[ZERO]] + // CHECK: [[ADDP3:%.+]] = arith.addi [[ADDP2]], [[SELP3]] + // CHECK: [[XCMP:%.+]] = arith.cmpi slt, [[ADDP3]], [[ONE]] + // CHECK: [[XSEL:%.+]] = select [[XCMP]], [[ONE]], [[ADDP3]] + + // Given the valid coverage of the pooling region, normalize the summation. 
+ // CHECK: [[C:%.+]] = arith.muli [[YSEL]], [[XSEL]] + // CHECK: [[CI:%.+]] = arith.index_cast [[C]] + // CHECK: [[CF:%.+]] = arith.sitofp [[CI]] + // CHECK: [[RESULT:%.+]] = arith.divf %arg1, [[CF]] + // CHECK: linalg.yield [[RESULT]] + %0 = "tosa.avg_pool2d"(%arg0) {pad = [1, 1, 1, 1], kernel = [4, 4], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) + return %0 : tensor<1x5x33x62xf32> +} + +// ----- + +// CHECK-LABEL: @avg_pool_i8 +func @avg_pool_i8(%arg0 : tensor<1x128x128x2xi8>) -> () { + + // CHECK: linalg.pooling_nhwc_sum + // CHECK: linalg.generic + + // CHECK: %[[INZP:.+]] = arith.constant -128 + // CHECK: %[[INZP_OFF:.+]] = arith.muli %{{.+}}, %[[INZP]] + // CHECK: %[[OFFSETED:.+]] = arith.subi %arg1, %[[INZP_OFF]] + // CHECK: %[[NUMERATOR:.+]] = arith.constant 1073741825 + // CHECK: %[[MULTIPLIER:.+]] = arith.divui %[[NUMERATOR]], %{{.+}} + // CHECK: %[[SHIFT:.+]] = arith.constant 30 + // CHECK: %[[SCALE:.+]] = "tosa.apply_scale"(%{{.+}}, %[[MULTIPLIER]], %[[SHIFT]]) {double_round = false} + // CHECK: %[[OUTZP:.+]] = arith.constant -128 + // CHECK: %[[OUT:.+]] = arith.addi %[[SCALE]], %[[OUTZP]] + // CHECK: %[[MIN:.+]] = arith.constant -128 + // CHECK: %[[MAX:.+]] = arith.constant 127 + // CHECK: %[[CMP_MIN:.+]] = arith.cmpi slt, %[[OUT]], %[[MIN]] + // CHECK: %[[CLMP_MIN:.+]] = select %[[CMP_MIN]], %[[MIN]], %[[OUT]] + // CHECK: %[[CMP_MAX:.+]] = arith.cmpi slt, %[[MAX]], %[[OUT]] + // CHECK: %[[CLMP_MAX:.+]] = select %[[CMP_MAX]], %[[MAX]], %[[CLMP_MIN]] + // CHECK: %[[TRUNC:.+]] = arith.trunci %[[CLMP_MAX]] + // CHECK: linalg.yield %[[TRUNC]] + %0 = "tosa.avg_pool2d"(%arg0) {kernel = [4, 4], pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, output_zp = -128 : i32}, stride = [4, 4]} : (tensor<1x128x128x2xi8>) -> tensor<1x32x32x2xi8> + return +} + +// ----- + +// CHECK-LABEL: @avg_pool_i16 +func @avg_pool_i16(%arg0 : tensor<1x128x128x2xi16>) -> () { + + // CHECK: linalg.pooling_nhwc_sum + // CHECK: linalg.generic + + 
// CHECK: %[[INZP:.+]] = arith.constant -128 + // CHECK: %[[INZP_OFF:.+]] = arith.muli %{{.+}}, %[[INZP]] + // CHECK: %[[OFFSETED:.+]] = arith.subi %arg1, %[[INZP_OFF]] + // CHECK: %[[NUMERATOR:.+]] = arith.constant 1073741825 + // CHECK: %[[MULTIPLIER:.+]] = arith.divui %[[NUMERATOR]], %{{.+}} + // CHECK: %[[SHIFT:.+]] = arith.constant 30 + // CHECK: %[[SCALE:.+]] = "tosa.apply_scale"(%{{.+}}, %[[MULTIPLIER]], %[[SHIFT]]) {double_round = false} + // CHECK: %[[OUTZP:.+]] = arith.constant -128 + // CHECK: %[[OUT:.+]] = arith.addi %[[SCALE]], %[[OUTZP]] + // CHECK: %[[MIN:.+]] = arith.constant -32768 + // CHECK: %[[MAX:.+]] = arith.constant 32767 + // CHECK: %[[CMP_MIN:.+]] = arith.cmpi slt, %[[OUT]], %[[MIN]] + // CHECK: %[[CLMP_MIN:.+]] = select %[[CMP_MIN]], %[[MIN]], %[[OUT]] + // CHECK: %[[CMP_MAX:.+]] = arith.cmpi slt, %[[MAX]], %[[OUT]] + // CHECK: %[[CLMP_MAX:.+]] = select %[[CMP_MAX]], %[[MAX]], %[[CLMP_MIN]] + // CHECK: %[[TRUNC:.+]] = arith.trunci %[[CLMP_MAX]] + // CHECK: linalg.yield %[[TRUNC]] + %0 = "tosa.avg_pool2d"(%arg0) {kernel = [4, 4], pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, output_zp = -128 : i32}, stride = [4, 4]} : (tensor<1x128x128x2xi16>) -> tensor<1x32x32x2xi16> + return +} + +// ----- + +// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @conv2d_f32 +func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () { + // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 2, 3, 0]> + // CHECK: %[[W:.+]] = "tosa.transpose"(%arg1, %[[PERM]]) + // CHECK: %[[M_IN:.+]] = linalg.init_tensor [1, 45, 40, 28] + // CHECK: %[[CST:.+]] = arith.constant 0 + // CHECK: %[[FILL:.+]] = linalg.fill + // CHECK: %[[B_IN:.+]] = linalg.init_tensor [1, 45, 40, 28] + // CHECK: %[[CONV:.+]] = linalg.conv_2d_nhwc_hwcf {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} 
ins(%arg0, %[[W]] : tensor<1x49x42x27xf32>, tensor<3x3x27x28xf32>) outs(%[[FILL]] : tensor<1x45x40x28xf32>) + // CHECK: %[[B:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %[[CONV]] : tensor<28xf32>, tensor<1x45x40x28xf32>) outs(%[[B_IN]] : tensor<1x45x40x28xf32>) + // CHECK: arith.addf + // CHECK: linalg.yield + %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [0, 0, 0, 0], stride = [1, 1], dilation = [2, 1]} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) + return +} + +// ----- + +// CHECK-LABEL: @conv2d_padded_f32 +func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28x3x3x28xf32>, %bias: tensor<28xf32>) -> () { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: linalg.yield %[[C0]] + // CHECK: linalg.conv_2d_nhwc_hwcf + %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [1, 1, 1, 1], stride = [1, 1], dilation = [2, 1]} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) + return +} + +// ----- + +// CHECK-LABEL: @conv2d_quant +func @conv2d_quant(%arg0 : tensor<1x12x12x1xi8>, %arg1 : tensor<1024x3x3x1xi8>, %arg2 : tensor<1024xi32>) -> () { + // CHECK: %[[C22:.+]] = arith.constant -22 + // CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: linalg.yield %[[C22]] + // CHECK: linalg.conv_2d_nhwc_hwcf_q + %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = -22 : i32, weight_zp = 42 : i32}, stride = [1, 1]} : (tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x12x12x1024xi32> + return +} + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @depthwise_conv 
+func @depthwise_conv(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () { + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 5, 3, 11] + // CHECK: [[CST0:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) + // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 5, 5, 33] + // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>) outs([[FILL]] : tensor<1x5x5x3x11xf32>) + // CHECK: [[COLLAPSED:%.+]] = "tosa.reshape"([[DEPTH]]) {new_shape = [1, 5, 5, 33]} + // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<33xf32>, tensor<1x5x5x33xf32>) outs([[OUT]] : tensor<1x5x5x33xf32>) { + // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors + // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 + // CHECK: linalg.yield [[ADD]] : f32 + // CHECK: } -> tensor<1x5x5x33xf32> + %2 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) { pad = [0, 0, 0, 0], stride = [1, 1], dilation = [1, 1] } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>) -> (tensor<1x5x5x33xf32>) + return +} + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @depthwise_conv_strides +func @depthwise_conv_strides(%arg0 : tensor<1x11x9x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () { + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 5, 3, 11] + // CHECK: [[CST0:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) + // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 5, 5, 33] + // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, 
strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>) outs([[FILL]] : tensor<1x5x5x3x11xf32>) + // CHECK: [[COLLAPSED:%.+]] = "tosa.reshape"([[DEPTH]]) {new_shape = [1, 5, 5, 33]} + // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<33xf32>, tensor<1x5x5x33xf32>) outs([[OUT]] : tensor<1x5x5x33xf32>) { + // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors + // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 + // CHECK: linalg.yield [[ADD]] : f32 + // CHECK: } -> tensor<1x5x5x33xf32> + %2 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) { pad = [0, 0, 0, 0], stride = [2, 2], dilation = [1, 1] } : (tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>) -> (tensor<1x5x5x33xf32>) + return +} + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @depthwise_conv_quant +func @depthwise_conv_quant(%arg0 : tensor<1x12x12x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () { + // CHECK: [[PADV:%.+]] = arith.constant -128 + // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: linalg.yield [[PADV]] + + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 12, 12, 4, 128] + // CHECK: [[CST0:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) + // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 12, 12, 512] + // CHECK: [[C128:%.+]] = arith.constant -128 + // CHECK: [[C42:%.+]] = arith.constant 42 + // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm_q {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins([[PAD]], %arg1, [[C128]], [[C42]] : tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, i32, i32) outs([[FILL]] : tensor<1x12x12x4x128xi32>) + // 
CHECK: [[COLLAPSED:%.+]] = "tosa.reshape"([[DEPTH]]) {new_shape = [1, 12, 12, 512]} + // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<512xi32>, tensor<1x12x12x512xi32>) outs([[OUT]] : tensor<1x12x12x512xi32>) { + // CHECK: ^bb0(%arg3: i32, %arg4: i32, %arg5: i32): // no predecessors + // CHECK: [[ADD:%.+]] = arith.addi %arg3, %arg4 : i32 + // CHECK: linalg.yield [[ADD]] : i32 + // CHECK: } -> tensor<1x12x12x512xi32> + %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {pad = [1, 1, 1, 1], quantization_info = {input_zp = -128 : i32, weight_zp = 42 : i32}, stride = [1, 1], dilation = [1, 1] } : (tensor<1x12x12x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x12x12x512xi32> + return +} + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @depthwise_conv_quant_dilations +func @depthwise_conv_quant_dilations(%arg0 : tensor<1x14x14x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () { + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 10, 10, 4, 128] + // CHECK: [[CST0:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) + // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 10, 10, 512] + // CHECK: [[C128:%.+]] = arith.constant -128 + // CHECK: [[C42:%.+]] = arith.constant 42 + // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm_q {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1, [[C128]], [[C42]] : tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, i32, i32) outs([[FILL]] : tensor<1x10x10x4x128xi32>) + // CHECK: [[COLLAPSED:%.+]] = "tosa.reshape"([[DEPTH]]) {new_shape = [1, 10, 10, 512]} + // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<512xi32>, tensor<1x10x10x512xi32>) outs([[OUT]] : tensor<1x10x10x512xi32>) { + // CHECK: ^bb0(%arg3: i32, %arg4: i32, %arg5: i32): // no predecessors + // CHECK: [[ADD:%.+]] = arith.addi %arg3, %arg4 : i32 + // CHECK: linalg.yield [[ADD]] : i32 + // CHECK: } -> tensor<1x10x10x512xi32> + %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, weight_zp = 42 : i32}, stride = [1, 1], dilation = [2, 2] } : (tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x10x10x512xi32> + return +} diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -1064,154 +1064,6 @@ // ----- - -// CHECK-LABEL: @matmul -func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) { - // CHECK: [[C0:%.+]] = arith.constant 0 - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 6] - // CHECK: [[FILLED:%.+]] = linalg.fill([[C0]], [[INIT]]) : f32, tensor<1x5x6xf32> -> tensor<1x5x6xf32> - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x6xf32>) outs([[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> - %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) - return %0 : tensor<1x5x6xf32> -} - -// ----- - - -// CHECK-LABEL: @matmul_quantized -func @matmul_quantized(%arg0: tensor<1x5x3xi8>, %arg1: tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) { - // CHECK: [[C0:%.+]] = arith.constant 0 - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 6] - // CHECK: [[FILLED:%.+]] = linalg.fill([[C0]], [[INIT]]) : i32, tensor<1x5x6xi32> -> tensor<1x5x6xi32> - // CHECK: [[ONE:%.+]] = arith.constant 1 - // CHECK: [[TWO:%.+]] = arith.constant 2 - // CHECK: linalg.quantized_batch_matmul 
ins(%arg0, %arg1, [[ONE]], [[TWO]] : tensor<1x5x3xi8>, tensor<1x3x6xi8>, i32, i32) outs([[FILLED]] : tensor<1x5x6xi32>) -> tensor<1x5x6xi32> - %0 = "tosa.matmul"(%arg0, %arg1) {quantization_info = {a_zp = 1 : i32, b_zp = 2 : i32}} : (tensor<1x5x3xi8>, tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) - return %0 : tensor<1x5x6xi32> -} - -// ----- - -// CHECK-LABEL: @matmul_dyn_batch -func @matmul_dyn_batch(%arg0: tensor, %arg1: tensor) -> (tensor) { - // CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]] - // CHECK: %[[C0_0:.+]] = arith.constant 0 - // CHECK: %[[INIT:.+]] = linalg.init_tensor [%[[DIM]], 5, 6] - // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0_0]], %[[INIT]]) : f32, tensor -> tensor - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor, tensor) outs(%[[FILLED]] : tensor) -> tensor - %0 = "tosa.matmul"(%arg0, %arg1) : (tensor, tensor) -> (tensor) - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: @matmul_dyn_independent_dim -func @matmul_dyn_independent_dim(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x?xf32>) -> (tensor<1x5x?xf32>) { - // CHECK: %[[C2:.+]] = arith.constant 2 - // CHECK: %[[DIM:.+]] = tensor.dim %arg1, %[[C2]] - // CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 5, %[[DIM]]] - // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0]], %[[INIT]]) : f32, tensor<1x5x?xf32> -> tensor<1x5x?xf32> - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x?xf32>) outs(%[[FILLED]] : tensor<1x5x?xf32>) -> tensor<1x5x?xf32> - %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x?xf32>) -> (tensor<1x5x?xf32>) - return %0 : tensor<1x5x?xf32> -} - -// ----- - -// CHECK-LABEL: @matmul_dyn_independent_dim -func @matmul_dyn_independent_dim(%arg0: tensor<1x5x?xf32>, %arg1: tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) { - // CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 5, 6] - // CHECK: %[[FILLED:.+]] = 
linalg.fill(%[[C0]], %[[INIT]]) : f32, tensor<1x5x6xf32> -> tensor<1x5x6xf32> - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x?xf32>, tensor<1x?x6xf32>) outs(%[[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> - %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x?xf32>, tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) - return %0 : tensor<1x5x6xf32> -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d1, d0)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)> -// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d1)> - -// CHECK-LABEL: @fully_connected -func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<5x6xf32>) { - // CHECK: [[INITT:%.+]] = linalg.init_tensor [5, 6] - // CHECK: [[ZERO:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[ZERO]], [[INITT]]) - // CHECK: [[PERM:%.+]] = arith.constant dense<[1, 0]> - // CHECK: [[INITT:%.+]] = linalg.init_tensor [3, 6] - // CHECK: [[TRANSPOSE:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<6x3xf32>) outs([[INITT]] : tensor<3x6xf32>) { - // CHECK: ^bb0([[IN:%.+]]: f32, [[UNUSED:%.+]]: f32): - // CHECK: linalg.yield [[IN]] : f32 - // CHECK: [[INITB:%.+]] = linalg.init_tensor [5, 6] - // CHECK: [[MATMUL:%.+]] = linalg.matmul ins(%arg0, [[TRANSPOSE]] : tensor<5x3xf32>, tensor<3x6xf32>) outs([[FILL]] : tensor<5x6xf32>) -> tensor<5x6xf32> - // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xf32>, tensor<5x6xf32>) outs([[INITB]] : tensor<5x6xf32>) { - // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): - // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 - // CHECK: linalg.yield [[ADD]] : f32 - - %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) : (tensor<5x3xf32>, tensor<6x3xf32>, tensor<6xf32>) -> (tensor<5x6xf32>) - return %0 : 
tensor<5x6xf32> -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d1, d0)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)> -// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d1)> - -// CHECK-LABEL: @quantized_fully_connected -func @quantized_fully_connected(%arg0: tensor<5x3xi8>, %arg1: tensor<6x3xi8>, %arg2: tensor<6xi32>) -> (tensor<5x6xi32>) { - // CHECK: [[INITT:%.+]] = linalg.init_tensor [5, 6] - // CHECK: [[ZERO:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[ZERO]], [[INITT]]) - // CHECK: [[PERM:%.+]] = arith.constant dense<[1, 0]> - // CHECK: [[INITT:%.+]] = linalg.init_tensor [3, 6] - // CHECK: [[TRANSPOSE:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<6x3xi8>) outs([[INITT]] : tensor<3x6xi8>) { - // CHECK: ^bb0([[IN:%.+]]: i8, [[UNUSED:%.+]]: i8): - // CHECK: linalg.yield [[IN]] : i8 - // CHECK: [[INITB:%.+]] = linalg.init_tensor [5, 6] - // CHECK: [[ONE:%.+]] = arith.constant 1 - // CHECK: [[TWO:%.+]] = arith.constant 2 - // CHECK: [[MATMUL:%.+]] = linalg.quantized_matmul ins(%arg0, [[TRANSPOSE]], [[ONE]], [[TWO]] : tensor<5x3xi8>, tensor<3x6xi8>, i32, i32) outs([[FILL]] : tensor<5x6xi32>) -> tensor<5x6xi32> - // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xi32>, tensor<5x6xi32>) outs([[INITB]] - // CHECK: ^bb0([[IN1:%.+]]: i32, [[IN2:%.+]]: i32, [[UNUSED:%.+]]: i32): - // CHECK: [[ADD:%.+]] = arith.addi - // CHECK: linalg.yield [[ADD]] : i32 - %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) {quantization_info = {input_zp = 1:i32, weight_zp = 2:i32}} : (tensor<5x3xi8>, tensor<6x3xi8>, tensor<6xi32>) -> (tensor<5x6xi32>) - return %0 : tensor<5x6xi32> -} - -// ----- - -// CHECK-LABEL: @fully_connected_dyn -func @fully_connected_dyn(%arg0: tensor, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor) { - 
// CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]] - // CHECK: %[[INITT:.+]] = linalg.init_tensor [%[[DIM]], 6] - // CHECK: %[[ZERO:.+]] = arith.constant 0 - // CHECK: %[[FILL:.+]] = linalg.fill(%[[ZERO]], %[[INITT]]) - // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> - // CHECK: %[[INITT:.+]] = linalg.init_tensor [3, 6] - // CHECK: %[[TRANSPOSE:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<6x3xf32>) outs(%[[INITT]] : tensor<3x6xf32>) { - // CHECK: ^bb0(%[[IN:.+]]: f32, %[[UNUSED:.+]]: f32): - // CHECK: linalg.yield %[[IN]] : f32 - // CHECK: %[[INITB:.+]] = linalg.init_tensor [%[[DIM]], 6] - // CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%arg0, %[[TRANSPOSE]] : tensor, tensor<3x6xf32>) outs(%[[FILL]] : tensor) -> tensor - // CHECK: %[[ADDED:.+]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, %[[MATMUL]] : tensor<6xf32>, tensor) outs(%[[INITB]] : tensor) { - // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): - // CHECK: %[[ADD:.+]] = arith.addf %arg3, %arg4 : f32 - // CHECK: linalg.yield %[[ADD]] : f32 - - %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) : (tensor, tensor<6x3xf32>, tensor<6xf32>) -> (tensor) - return %0 : tensor -} - -// ----- - func @pad_float(%arg0 : tensor<1x2xf32>) -> (tensor<4x9xf32>) { %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32> // TODO: Output contains multiple "arith.constant 1 : index". 
@@ -1395,318 +1247,6 @@ // ----- -// CHECK-LABEL: @max_pool -func @max_pool(%arg0: tensor<1x6x34x62xf32>) -> () { - // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38 - // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 32, 62] - // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[CONST]], [[INIT]]) - // CHECK-DAG: [[KERNEL:%.+]] = linalg.init_tensor [3, 3] - // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, [[KERNEL]] : tensor<1x6x34x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x32x62xf32>) - %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x32x62xf32>) - return -} - -// CHECK-LABEL: @max_pool_padded -func @max_pool_padded(%arg0: tensor<1x6x34x62xf32>) -> () { - // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38 : f32 - // CHECK-DAG: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 0, 0, 0] high[0, 0, 1, 0] - // CHECK-DAG: linalg.yield [[CONST]] - // CHECK-DAG: [[INITVAL:%.+]] = arith.constant -3.40282347E+38 : f32 - // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 33, 62] - // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[INITVAL]], [[INIT]]) - // CHECK-DAG: [[KERNEL:%.+]] = linalg.init_tensor [3, 3] - // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x6x35x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x33x62xf32>) - %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 1], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x33x62xf32>) - return -} - -// CHECK-LABEL: @max_pool_i8 -func @max_pool_i8(%arg0: tensor<1x6x34x62xi8>) -> () { - // CHECK: arith.constant -128 - // CHECK: linalg.pooling_nhwc_max - %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi8>) -> (tensor<1x4x32x62xi8>) - return -} - -// CHECK-LABEL: @max_pool_i16 -func 
@max_pool_i16(%arg0: tensor<1x6x34x62xi16>) -> () { - // CHECK: arith.constant -32768 - // CHECK: linalg.pooling_nhwc_max - %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi16>) -> (tensor<1x4x32x62xi16>) - return -} - -// CHECK-LABEL: @max_pool_i32 -func @max_pool_i32(%arg0: tensor<1x6x34x62xi32>) -> () { - // CHECK: arith.constant -2147483648 - // CHECK: linalg.pooling_nhwc_max - %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi32>) -> (tensor<1x4x32x62xi32>) - return -} -// ----- - -// CHECK-LABEL: @avg_pool -func @avg_pool(%arg0: tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) { - // Initial piece computes the sum of the pooling region, with appropriate padding. - // CHECK: [[CONST:%.+]] = arith.constant 0 - // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: [[CONST:%.+]] = arith.constant 0 - // CHECK: [[POOLINIT:%.+]] = linalg.init_tensor [1, 5, 33, 62] - // CHECK: [[FILL:%.+]] = linalg.fill([[CONST]], [[POOLINIT]]) - // CHECK: [[KERNEL:%.+]] = linalg.init_tensor [4, 4] - // CHECK: [[POOL:%.+]] = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x8x36x62xf32>, tensor<4x4xf32>) outs([[FILL]] : tensor<1x5x33x62xf32>) - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 33, 62] - // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[POOL]] : tensor<1x5x33x62xf32>) outs([[INIT]] : tensor<1x5x33x62xf32>) - // CHECK: [[ZERO:%.0]] = arith.constant 0 - // CHECK: [[ONE:%.+]] = arith.constant 1 - // CHECK: [[HEIGHT:%.+]] = arith.constant 4 - // CHECK: [[WIDTH:%.+]] = arith.constant 32 - // CHECK: [[IDX1:%.+]] = linalg.index 1 - // CHECK: [[IDX2:%.+]] = linalg.index 2 - - // The large block below computes what portion of the kernel is within 
non-padded input. - // CHECK: [[NY:%.+]] = arith.subi [[HEIGHT]], [[IDX1]] - // CHECK: [[NX:%.+]] = arith.subi [[WIDTH]], [[IDX2]] - // CHECK: [[KH:%.+]] = arith.constant 4 - // CHECK: [[PAD0:%.+]] = arith.constant 1 - // CHECK: [[SUBP0:%.+]] = arith.subi [[IDX1]], [[PAD0]] - // CHECK: [[P0CMP:%.+]] = arith.cmpi slt, [[SUBP0]], [[ZERO]] - // CHECK: [[SELP0:%.+]] = select [[P0CMP]], [[SUBP0]], [[ZERO]] - // CHECK: [[ADDP0:%.+]] = arith.addi [[KH]], [[SELP0]] - // CHECK: [[PAD1:%.+]] = arith.constant 1 - // CHECK: [[SUBP1:%.+]] = arith.subi [[NY]], [[PAD1]] - // CHECK: [[P1CMP:%.+]] = arith.cmpi slt, [[SUBP1]], [[ZERO]] - // CHECK: [[SELP1:%.+]] = select [[P1CMP]], [[SUBP1]], [[ZERO]] - // CHECK: [[ADDP1:%.+]] = arith.addi [[ADDP0]], [[SELP1]] - // CHECK: [[YCMP:%.+]] = arith.cmpi slt, [[ADDP1]], [[ONE]] - // CHECK: [[YSEL:%.+]] = select [[YCMP]], [[ONE]], [[ADDP1]] - // CHECK: [[KW:%.+]] = arith.constant 4 : index - // CHECK: [[PAD2:%.+]] = arith.constant 1 : index - // CHECK: [[SUBP2:%.+]] = arith.subi [[IDX2]], [[PAD2]] - // CHECK: [[P2CMP:%.+]] = arith.cmpi slt, [[SUBP2]], [[ZERO]] - // CHECK: [[SELP2:%.+]] = select [[P2CMP]], [[SUBP2]], [[ZERO]] - // CHECK: [[ADDP2:%.+]] = arith.addi [[KW]], [[SELP2]] - // CHECK: [[PAD3:%.+]] = arith.constant 1 : index - // CHECK: [[SUBP3:%.+]] = arith.subi [[NX]], [[PAD3]] - // CHECK: [[P3CMP:%.+]] = arith.cmpi slt, [[SUBP3]], [[ZERO]] - // CHECK: [[SELP3:%.+]] = select [[P3CMP]], [[SUBP3]], [[ZERO]] - // CHECK: [[ADDP3:%.+]] = arith.addi [[ADDP2]], [[SELP3]] - // CHECK: [[XCMP:%.+]] = arith.cmpi slt, [[ADDP3]], [[ONE]] - // CHECK: [[XSEL:%.+]] = select [[XCMP]], [[ONE]], [[ADDP3]] - - // Given the valid coverage of the pooling region, normalize the summation. 
- // CHECK: [[C:%.+]] = arith.muli [[YSEL]], [[XSEL]] - // CHECK: [[CI:%.+]] = arith.index_cast [[C]] - // CHECK: [[CF:%.+]] = arith.sitofp [[CI]] - // CHECK: [[RESULT:%.+]] = arith.divf %arg1, [[CF]] - // CHECK: linalg.yield [[RESULT]] - %0 = "tosa.avg_pool2d"(%arg0) {pad = [1, 1, 1, 1], kernel = [4, 4], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) - return %0 : tensor<1x5x33x62xf32> -} - -// ----- - -// CHECK-LABEL: @avg_pool_i8 -func @avg_pool_i8(%arg0 : tensor<1x128x128x2xi8>) -> () { - - // CHECK: linalg.pooling_nhwc_sum - // CHECK: linalg.generic - - // CHECK: %[[INZP:.+]] = arith.constant -128 - // CHECK: %[[INZP_OFF:.+]] = arith.muli %{{.+}}, %[[INZP]] - // CHECK: %[[OFFSETED:.+]] = arith.subi %arg1, %[[INZP_OFF]] - // CHECK: %[[NUMERATOR:.+]] = arith.constant 1073741825 - // CHECK: %[[MULTIPLIER:.+]] = arith.divui %[[NUMERATOR]], %{{.+}} - // CHECK: %[[SHIFT:.+]] = arith.constant 30 - // CHECK: %[[SCALE:.+]] = "tosa.apply_scale"(%{{.+}}, %[[MULTIPLIER]], %[[SHIFT]]) {double_round = false} - // CHECK: %[[OUTZP:.+]] = arith.constant -128 - // CHECK: %[[OUT:.+]] = arith.addi %[[SCALE]], %[[OUTZP]] - // CHECK: %[[MIN:.+]] = arith.constant -128 - // CHECK: %[[MAX:.+]] = arith.constant 127 - // CHECK: %[[CMP_MIN:.+]] = arith.cmpi slt, %[[OUT]], %[[MIN]] - // CHECK: %[[CLMP_MIN:.+]] = select %[[CMP_MIN]], %[[MIN]], %[[OUT]] - // CHECK: %[[CMP_MAX:.+]] = arith.cmpi slt, %[[MAX]], %[[OUT]] - // CHECK: %[[CLMP_MAX:.+]] = select %[[CMP_MAX]], %[[MAX]], %[[CLMP_MIN]] - // CHECK: %[[TRUNC:.+]] = arith.trunci %[[CLMP_MAX]] - // CHECK: linalg.yield %[[TRUNC]] - %0 = "tosa.avg_pool2d"(%arg0) {kernel = [4, 4], pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, output_zp = -128 : i32}, stride = [4, 4]} : (tensor<1x128x128x2xi8>) -> tensor<1x32x32x2xi8> - return -} - -// ----- - -// CHECK-LABEL: @avg_pool_i16 -func @avg_pool_i16(%arg0 : tensor<1x128x128x2xi16>) -> () { - - // CHECK: linalg.pooling_nhwc_sum - // CHECK: linalg.generic - - 
// CHECK: %[[INZP:.+]] = arith.constant -128 - // CHECK: %[[INZP_OFF:.+]] = arith.muli %{{.+}}, %[[INZP]] - // CHECK: %[[OFFSETED:.+]] = arith.subi %arg1, %[[INZP_OFF]] - // CHECK: %[[NUMERATOR:.+]] = arith.constant 1073741825 - // CHECK: %[[MULTIPLIER:.+]] = arith.divui %[[NUMERATOR]], %{{.+}} - // CHECK: %[[SHIFT:.+]] = arith.constant 30 - // CHECK: %[[SCALE:.+]] = "tosa.apply_scale"(%{{.+}}, %[[MULTIPLIER]], %[[SHIFT]]) {double_round = false} - // CHECK: %[[OUTZP:.+]] = arith.constant -128 - // CHECK: %[[OUT:.+]] = arith.addi %[[SCALE]], %[[OUTZP]] - // CHECK: %[[MIN:.+]] = arith.constant -32768 - // CHECK: %[[MAX:.+]] = arith.constant 32767 - // CHECK: %[[CMP_MIN:.+]] = arith.cmpi slt, %[[OUT]], %[[MIN]] - // CHECK: %[[CLMP_MIN:.+]] = select %[[CMP_MIN]], %[[MIN]], %[[OUT]] - // CHECK: %[[CMP_MAX:.+]] = arith.cmpi slt, %[[MAX]], %[[OUT]] - // CHECK: %[[CLMP_MAX:.+]] = select %[[CMP_MAX]], %[[MAX]], %[[CLMP_MIN]] - // CHECK: %[[TRUNC:.+]] = arith.trunci %[[CLMP_MAX]] - // CHECK: linalg.yield %[[TRUNC]] - %0 = "tosa.avg_pool2d"(%arg0) {kernel = [4, 4], pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, output_zp = -128 : i32}, stride = [4, 4]} : (tensor<1x128x128x2xi16>) -> tensor<1x32x32x2xi16> - return -} - -// ----- - -// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)> -// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)> - -// CHECK-LABEL: @conv2d_f32 -func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () { - // CHECK: %[[W_IN:.+]] = linalg.init_tensor [3, 3, 27, 28] - // CHECK: %[[W:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1 : tensor<28x3x3x27xf32>) outs(%[[W_IN]] : tensor<3x3x27x28xf32>) - // CHECK: linalg.yield %arg3 : f32 - // CHECK: %[[M_IN:.+]] = linalg.init_tensor [1, 45, 40, 
28] - // CHECK: %[[CST:.+]] = arith.constant 0 - // CHECK: %[[FILL:.+]] = linalg.fill - // CHECK: %[[B_IN:.+]] = linalg.init_tensor [1, 45, 40, 28] - // CHECK: %[[CONV:.+]] = linalg.conv_2d_nhwc_hwcf {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[W]] : tensor<1x49x42x27xf32>, tensor<3x3x27x28xf32>) outs(%[[FILL]] : tensor<1x45x40x28xf32>) - // CHECK: %[[B:.+]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %[[CONV]] : tensor<28xf32>, tensor<1x45x40x28xf32>) outs(%[[B_IN]] : tensor<1x45x40x28xf32>) - // CHECK: arith.addf - // CHECK: linalg.yield %7 : f32 - %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [0, 0, 0, 0], stride = [1, 1], dilation = [2, 1]} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) - return -} - -// ----- - -// CHECK-LABEL: @conv2d_padded_f32 -func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28x3x3x28xf32>, %bias: tensor<28xf32>) -> () { - // CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: linalg.yield %[[C0]] - // CHECK: linalg.conv_2d_nhwc_hwcf - %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [1, 1, 1, 1], stride = [1, 1], dilation = [2, 1]} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) - return -} - -// ----- - -// CHECK-LABEL: @conv2d_quant -func @conv2d_quant(%arg0 : tensor<1x12x12x1xi8>, %arg1 : tensor<1024x3x3x1xi8>, %arg2 : tensor<1024xi32>) -> () { - // CHECK: %[[C22:.+]] = arith.constant -22 - // CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: linalg.yield %[[C22]] - // CHECK: linalg.conv_2d_nhwc_hwcf_q - %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = -22 : i32, weight_zp = 42 : i32}, stride = [1, 1]} : 
(tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x12x12x1024xi32> - return -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -// CHECK-LABEL: @depthwise_conv -func @depthwise_conv(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () { - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 5, 3, 11] - // CHECK: [[CST0:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) - // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 5, 5, 33] - // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>) outs([[FILL]] : tensor<1x5x5x3x11xf32>) - // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]] - // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<33xf32>, tensor<1x5x5x33xf32>) outs([[OUT]] : tensor<1x5x5x33xf32>) { - // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors - // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 - // CHECK: linalg.yield [[ADD]] : f32 - // CHECK: } -> tensor<1x5x5x33xf32> - %2 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) { pad = [0, 0, 0, 0], stride = [1, 1], dilation = [1, 1] } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>) -> (tensor<1x5x5x33xf32>) - return -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -// CHECK-LABEL: @depthwise_conv_strides -func @depthwise_conv_strides(%arg0 : tensor<1x11x9x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () { - // CHECK: [[INIT:%.+]] = 
linalg.init_tensor [1, 5, 5, 3, 11] - // CHECK: [[CST0:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) - // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 5, 5, 33] - // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>) outs([[FILL]] : tensor<1x5x5x3x11xf32>) - // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]] - // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<33xf32>, tensor<1x5x5x33xf32>) outs([[OUT]] : tensor<1x5x5x33xf32>) { - // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors - // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 - // CHECK: linalg.yield [[ADD]] : f32 - // CHECK: } -> tensor<1x5x5x33xf32> - %2 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) { pad = [0, 0, 0, 0], stride = [2, 2], dilation = [1, 1] } : (tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>) -> (tensor<1x5x5x33xf32>) - return -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -// CHECK-LABEL: @depthwise_conv_quant -func @depthwise_conv_quant(%arg0 : tensor<1x12x12x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () { - // CHECK: [[PADV:%.+]] = arith.constant -128 - // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: linalg.yield [[PADV]] - - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 12, 12, 4, 128] - // CHECK: [[CST0:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) - // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 12, 12, 512] - // CHECK: [[C128:%.+]] = arith.constant -128 - // CHECK: 
[[C42:%.+]] = arith.constant 42 - // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm_q {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins([[PAD]], %arg1, [[C128]], [[C42]] : tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, i32, i32) outs([[FILL]] : tensor<1x12x12x4x128xi32>) - // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]] - // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<512xi32>, tensor<1x12x12x512xi32>) outs([[OUT]] : tensor<1x12x12x512xi32>) { - // CHECK: ^bb0(%arg3: i32, %arg4: i32, %arg5: i32): // no predecessors - // CHECK: [[ADD:%.+]] = arith.addi %arg3, %arg4 : i32 - // CHECK: linalg.yield [[ADD]] : i32 - // CHECK: } -> tensor<1x12x12x512xi32> - %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {pad = [1, 1, 1, 1], quantization_info = {input_zp = -128 : i32, weight_zp = 42 : i32}, stride = [1, 1], dilation = [1, 1] } : (tensor<1x12x12x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x12x12x512xi32> - return -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -// CHECK-LABEL: @depthwise_conv_quant_dilations -func @depthwise_conv_quant_dilations(%arg0 : tensor<1x14x14x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () { - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 10, 10, 4, 128] - // CHECK: [[CST0:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) - // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 10, 10, 512] - // CHECK: [[C128:%.+]] = arith.constant -128 - // CHECK: [[C42:%.+]] = arith.constant 42 - // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm_q {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1, [[C128]], [[C42]] 
: tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, i32, i32) outs([[FILL]] : tensor<1x10x10x4x128xi32>) - // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]] - // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<512xi32>, tensor<1x10x10x512xi32>) outs([[OUT]] : tensor<1x10x10x512xi32>) { - // CHECK: ^bb0(%arg3: i32, %arg4: i32, %arg5: i32): // no predecessors - // CHECK: [[ADD:%.+]] = arith.addi %arg3, %arg4 : i32 - // CHECK: linalg.yield [[ADD]] : i32 - // CHECK: } -> tensor<1x10x10x512xi32> - %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, weight_zp = 42 : i32}, stride = [1, 1], dilation = [2, 2] } : (tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x10x10x512xi32> - return -} - -// ----- - // CHECK-LABEL: @resize_nearest func @resize_nearest(%input: tensor<1x2x2x1xf32>) -> () { // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 4, 4, 1]