diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -628,10 +628,10 @@ scalar_arg: B --- !LinalgOpConfig metadata: !LinalgOpMetadata - name: conv_2d_input_nhwc_filter_ohwi_poly - cpp_class_name: Conv2DInputNhwcFilterOhwiPolyOp + name: conv_2d_nchw + cpp_class_name: Conv2DNchwOp doc: |- - Performs a 2-D convolution. + Performs 2-D convolution. Numeric casting is performed on the operands to the inner multiply, promoting them to the same data type as the accumulator/output. @@ -648,13 +648,13 @@ usage: InputOperand type_var: T2 shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] - -> (s4, s5, s6, s3)> + -> (s4, s1, s5, s6)> - !LinalgOperandDefConfig name: O usage: OutputOperand type_var: U shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] - -> (s0, s7, s8, s4)> + -> (s0, s4, s7, s8, s1)> - !LinalgOperandDefConfig name: strides usage: IndexAttribute @@ -670,18 +670,18 @@ indexing_maps: !LinalgIndexingMapsConfig static_indexing_maps: - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1 * s9 + d3 * s11, d2 * s10 + d4 * s12, d6)> + s9, s10, s11, s12] -> (d0, d4, d2 * s9 + d5 * s11, d3 * s10 + d6 * s12)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d5, d3, d4, d6)> + s9, s10, s11, s12] -> (d1, d4, d5, d6)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1, d2, d5)> + s9, s10, s11, s12] -> (d0, d1, d2, d3)> iterator_types: - parallel - parallel - parallel + - parallel - reduction - reduction - - parallel - reduction assignments: - !ScalarAssign @@ -710,14 +710,13 @@ scalar_arg: K --- !LinalgOpConfig metadata: !LinalgOpMetadata - name: conv_2d_input_nhwc_filter_ohwi_poly_q - cpp_class_name: Conv2DInputNhwcFilterOhwiPolyQOp + name: conv_2d_nhwc_hwcf + cpp_class_name: Conv2DNhwcHwcfOp doc: |- - Performs a 2-D quantized convolution. + Performs 2-D convolution. Numeric casting is performed on the operands to the inner multiply, promoting - them to the same data type as the accumulator/output. Includes zero point - adjustment for quantization. + them to the same data type as the accumulator/output. 
structured_op: !LinalgStructuredOpConfig args: - !LinalgOperandDefConfig @@ -731,21 +730,13 @@ usage: InputOperand type_var: T2 shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] - -> (s4, s5, s6, s3)> - - !LinalgOperandDefConfig - name: IZp - usage: InputOperand - type_var: I32 - - !LinalgOperandDefConfig - name: KZp - usage: InputOperand - type_var: I32 + -> (s4, s5, s3, s6)> - !LinalgOperandDefConfig name: O usage: OutputOperand type_var: U shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] - -> (s0, s7, s8, s4)> + -> (s0, s7, s8, s6)> - !LinalgOperandDefConfig name: strides usage: IndexAttribute @@ -761,22 +752,18 @@ indexing_maps: !LinalgIndexingMapsConfig static_indexing_maps: - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1 * s9 + d3 * s11, d2 * s10 + d4 * s12, d6)> - - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d5, d3, d4, d6)> + s9, s10, s11, s12] -> (d0, d1 * s9 + d4 * s11, d2 * s10 + d5 * s12, d6)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> ()> - - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> ()> + s9, s10, s11, s12] -> (d4, d5, d6, d3)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1, d2, d5)> + s9, s10, s11, s12] -> (d0, d1, d2, d3)> iterator_types: - parallel - parallel - parallel + - parallel - reduction - reduction - - parallel - reduction assignments: - !ScalarAssign @@ -792,37 +779,17 @@ fn_name: mul operands: - !ScalarExpression - scalar_apply: - fn_name: sub + symbolic_cast: + type_var: U operands: - !ScalarExpression - symbolic_cast: - type_var: U - operands: - - !ScalarExpression - scalar_arg: I - - !ScalarExpression - symbolic_cast: - type_var: U - operands: - - !ScalarExpression - scalar_arg: IZp + scalar_arg: I - !ScalarExpression - scalar_apply: - fn_name: sub + symbolic_cast: + type_var: U operands: - !ScalarExpression - symbolic_cast: - type_var: U - operands: - - !ScalarExpression - scalar_arg: K - - !ScalarExpression - symbolic_cast: - type_var: U - operands: - - !ScalarExpression - scalar_arg: KZp + scalar_arg: K --- !LinalgOpConfig metadata: !LinalgOpMetadata name: depthwise_conv_2d_input_nhwc_filter_hwc_poly @@ -906,13 +873,14 @@ scalar_arg: K --- !LinalgOpConfig metadata: !LinalgOpMetadata - name: depthwise_conv_2D_nchw - cpp_class_name: DepthwiseConv2DNchwOp + name: conv_2d_nhwc_hwcf_q + cpp_class_name: Conv2DNhwcHwcfQOp doc: |- - Performs depth-wise 2-D convolution. + Performs 2-D convolution with zero point offsets. Numeric casting is performed on the operands to the inner multiply, promoting - them to the same data type as the accumulator/output. + them to the same data type as the accumulator/output. This includes the zero + point offsets common to quantized operations. 
structured_op: !LinalgStructuredOpConfig args: - !LinalgOperandDefConfig @@ -927,12 +895,20 @@ type_var: T2 shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] -> (s4, s5, s3, s6)> + - !LinalgOperandDefConfig + name: IZp + usage: InputOperand + type_var: I32 + - !LinalgOperandDefConfig + name: KZp + usage: InputOperand + type_var: I32 - !LinalgOperandDefConfig name: O usage: OutputOperand type_var: U shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] - -> (s0, s7, s8, s3, s6)> + -> (s0, s7, s8, s6)> - !LinalgOperandDefConfig name: strides usage: IndexAttribute @@ -948,19 +924,23 @@ indexing_maps: !LinalgIndexingMapsConfig static_indexing_maps: - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1 * s9 + d3 * s11, d2 * s10 + d4 * s12, d5)> + s9, s10, s11, s12] -> (d0, d1 * s9 + d4 * s11, d2 * s10 + d5 * s12, d6)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d3, d4, d5, d6)> + s9, s10, s11, s12] -> (d4, d5, d6, d3)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1, d2, d5, d6)> + s9, s10, s11, s12] -> ()> + - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, + s9, s10, s11, s12] -> ()> + - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, + s9, s10, s11, s12] -> (d0, d1, d2, d3)> iterator_types: - parallel - parallel - parallel + - parallel + - reduction - reduction - reduction - - parallel - - parallel assignments: - !ScalarAssign arg: O @@ -975,21 +955,41 @@ fn_name: mul operands: - !ScalarExpression - symbolic_cast: - type_var: U + scalar_apply: + fn_name: sub operands: - !ScalarExpression - scalar_arg: I + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_arg: I + - !ScalarExpression + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_arg: IZp - !ScalarExpression - symbolic_cast: - type_var: U + scalar_apply: + fn_name: sub operands: - !ScalarExpression - scalar_arg: K + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_arg: K + - !ScalarExpression + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_arg: KZp --- !LinalgOpConfig metadata: !LinalgOpMetadata - name: depthwise_conv2D_nchw_q - cpp_class_name: DepthwiseConv2DNchwQOp + name: depthwise_conv2D_nchw + cpp_class_name: DepthwiseConv2DNchwOp doc: |- Performs depth-wise 2-D convolution. 
@@ -1009,14 +1009,6 @@ type_var: T2 shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] -> (s4, s5, s3, s6)> - - !LinalgOperandDefConfig - name: IZp - usage: InputOperand - type_var: I32 - - !LinalgOperandDefConfig - name: KZp - usage: InputOperand - type_var: I32 - !LinalgOperandDefConfig name: O usage: OutputOperand @@ -1041,10 +1033,6 @@ s9, s10, s11, s12] -> (d0, d1 * s9 + d3 * s11, d2 * s10 + d4 * s12, d5)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] -> (d3, d4, d5, d6)> - - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> ()> - - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> ()> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] -> (d0, d1, d2, d5, d6)> iterator_types: @@ -1069,43 +1057,23 @@ fn_name: mul operands: - !ScalarExpression - scalar_apply: - fn_name: sub + symbolic_cast: + type_var: U operands: - !ScalarExpression - symbolic_cast: - type_var: U - operands: - - !ScalarExpression - scalar_arg: I - - !ScalarExpression - symbolic_cast: - type_var: U - operands: - - !ScalarExpression - scalar_arg: IZp + scalar_arg: I - !ScalarExpression - scalar_apply: - fn_name: sub + symbolic_cast: + type_var: U operands: - !ScalarExpression - symbolic_cast: - type_var: U - operands: - - !ScalarExpression - scalar_arg: K - - !ScalarExpression - symbolic_cast: - type_var: U - operands: - - !ScalarExpression - scalar_arg: KZp + scalar_arg: K --- !LinalgOpConfig metadata: !LinalgOpMetadata - name: conv_2d_nchw - cpp_class_name: Conv2DNchwOp + name: depthwise_conv2D_nchw_q + cpp_class_name: DepthwiseConv2DNchwQOp doc: |- - Performs 2-D convolution. + Performs depth-wise 2-D convolution. Numeric casting is performed on the operands to the inner multiply, promoting them to the same data type as the accumulator/output. 
@@ -1122,13 +1090,21 @@ usage: InputOperand type_var: T2 shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] - -> (s4, s1, s5, s6)> + -> (s4, s5, s3, s6)> + - !LinalgOperandDefConfig + name: IZp + usage: InputOperand + type_var: I32 + - !LinalgOperandDefConfig + name: KZp + usage: InputOperand + type_var: I32 - !LinalgOperandDefConfig name: O usage: OutputOperand type_var: U shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] - -> (s0, s4, s7, s8, s1)> + -> (s0, s7, s8, s3, s6)> - !LinalgOperandDefConfig name: strides usage: IndexAttribute @@ -1144,19 +1120,23 @@ indexing_maps: !LinalgIndexingMapsConfig static_indexing_maps: - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d4, d2 * s9 + d5 * s11, d3 * s10 + d6 * s12)> + s9, s10, s11, s12] -> (d0, d1 * s9 + d3 * s11, d2 * s10 + d4 * s12, d5)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d1, d4, d5, d6)> + s9, s10, s11, s12] -> (d3, d4, d5, d6)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1, d2, d3)> + s9, s10, s11, s12] -> ()> + - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, + s9, s10, s11, s12] -> ()> + - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, + s9, s10, s11, s12] -> (d0, d1, d2, d5, d6)> iterator_types: - parallel - parallel - parallel - - parallel - - reduction - reduction - reduction + - parallel + - parallel assignments: - !ScalarAssign arg: O @@ -1171,17 +1151,37 @@ fn_name: mul operands: - !ScalarExpression - symbolic_cast: - type_var: U + scalar_apply: + fn_name: sub operands: - !ScalarExpression - scalar_arg: I + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_arg: I + - !ScalarExpression + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_arg: IZp - !ScalarExpression - symbolic_cast: - type_var: U + scalar_apply: + fn_name: sub operands: - !ScalarExpression - scalar_arg: K + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_arg: K + - !ScalarExpression + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_arg: KZp --- !LinalgOpConfig metadata: !LinalgOpMetadata name: pooling_nhwc_sum @@ -1896,3 +1896,4 @@ operands: - !ScalarExpression scalar_arg: I + diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -849,104 +849,213 @@ return success(); } -static LogicalResult -convolutionMatchAndRewriterHelper(Operation *op, - ConversionPatternRewriter &rewriter) { - Location loc = op->getLoc(); - Value input = op->getOperand(0); - Value weight = op->getOperand(1); - Value bias = op->getOperand(2); +namespace { - ShapedType inputTy = input.getType().cast<ShapedType>(); - ShapedType weightTy = weight.getType().cast<ShapedType>(); - ShapedType biasTy = bias.getType().cast<ShapedType>(); - ShapedType resultTy = op->getResult(0).getType().cast<ShapedType>(); +template <typename SrcOp> +class PointwiseConverter : public OpRewritePattern<SrcOp> { +public: + using OpRewritePattern<SrcOp>::OpRewritePattern; - Type inputETy = inputTy.getElementType(); - Type resultETy = resultTy.getElementType(); - - auto padAttr = op->getAttr("pad").cast<ArrayAttr>(); - auto strideTosaAttr = op->getAttr("stride").cast<ArrayAttr>(); - auto dilationTosaAttr = op->getAttr("dilation").cast<ArrayAttr>(); - - bool
isQuantized = op->hasAttr("quantization_info"); - IntegerAttr iZp; - IntegerAttr kZp; - if (isQuantized) { - auto quantizationInfo = - op->getAttr("quantization_info").cast<tosa::ConvOpQuantizationAttr>(); - iZp = rewriter.getI32IntegerAttr( - quantizationInfo.input_zp().getValue().getSExtValue()); - kZp = rewriter.getI32IntegerAttr( - quantizationInfo.weight_zp().getValue().getSExtValue()); + LogicalResult matchAndRewrite(SrcOp op, + PatternRewriter &rewriter) const final { + return elementwiseMatchAndRewriteHelper(op, rewriter); } +}; - if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() || - !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) - return rewriter.notifyMatchFailure(op, - "tosa.conv ops require static shapes"); +class ConvConverter : public OpConversionPattern<tosa::Conv2DOp> { +public: + using OpConversionPattern<tosa::Conv2DOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(tosa::Conv2DOp op, ArrayRef<Value> args, + ConversionPatternRewriter &rewriter) const final { + Location loc = op->getLoc(); + Value input = op->getOperand(0); + Value weight = op->getOperand(1); + Value bias = op->getOperand(2); - auto weightShape = weightTy.getShape(); - auto resultShape = resultTy.getShape(); + ShapedType inputTy = input.getType().cast<ShapedType>(); + ShapedType weightTy = weight.getType().cast<ShapedType>(); + ShapedType biasTy = bias.getType().cast<ShapedType>(); + ShapedType resultTy = op->getResult(0).getType().cast<ShapedType>(); - // Apply padding as necessary. - Attribute zeroAttr = rewriter.getZeroAttr(inputETy); - llvm::SmallVector<int64_t> pad; - pad.resize(2, 0); - getValuesFromIntArrayAttribute(padAttr, pad); - pad.resize(pad.size() + 2, 0); + Type inputETy = inputTy.getElementType(); + Type resultETy = resultTy.getElementType(); - input = applyPad(loc, input, pad, zeroAttr, rewriter); + auto padAttr = op->getAttr("pad").cast<ArrayAttr>(); + auto strideTosaAttr = op->getAttr("stride").cast<ArrayAttr>(); + auto dilationTosaAttr = op->getAttr("dilation").cast<ArrayAttr>(); + bool isQuantized = op->hasAttr("quantization_info"); - // Broadcast the initial value to the output tensor before convolving. - SmallVector<AffineMap, 4> indexingMaps; - indexingMaps.push_back(AffineMap::get( - /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, - {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() || + !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) + return rewriter.notifyMatchFailure(op, + "tosa.conv ops require static shapes"); - Value initTensor = rewriter.create<linalg::InitTensorOp>( - loc, resultTy.getShape(), resultTy.getElementType()); + auto weightShape = weightTy.getShape(); - Value biasBroadcast = - rewriter - .create<linalg::GenericOp>( - loc, resultTy, bias, initTensor, indexingMaps, - getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - nestedBuilder.create<linalg::YieldOp>(nestedLoc, args[0]); - }) - .getResult(0); - - // Extract the attributes for convolution. - llvm::SmallVector<int64_t> stride, dilation; - getValuesFromIntArrayAttribute(strideTosaAttr, stride); - getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); - - // Create the convolution op. - auto strideAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), stride); - auto dilationAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), dilation); - - if (isa<tosa::Conv2DOp>(op) && !isQuantized) { - rewriter.replaceOpWithNewOp<linalg::Conv2DInputNhwcFilterOhwiPolyOp>( + // Apply padding as necessary.
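+ // The TOSA pad attribute covers only the spatial (H, W) dimensions; the zero entries placed around it below leave the batch and channel dimensions of the NHWC input unpadded.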
+ Attribute zeroAttr = rewriter.getZeroAttr(inputETy); + llvm::SmallVector<int64_t> pad; + pad.resize(2, 0); + getValuesFromIntArrayAttribute(padAttr, pad); + pad.resize(pad.size() + 2, 0); + input = applyPad(loc, input, pad, zeroAttr, rewriter); + + // Transpose the kernel to match dimension ordering of the linalg + // convolution operation. + // TODO(suderman): See if this can be efficiently folded - check whether + // the input is used anywhere else, if not fold the constant. + SmallVector<int64_t> weightPerm{1, 2, 3, 0}; + SmallVector<int64_t> newWeightShape{weightShape[1], weightShape[2], + weightShape[3], weightShape[0]}; + auto weightPermAttr = DenseIntElementsAttr::get( + RankedTensorType::get({4}, rewriter.getI64Type()), weightPerm); + Value weightPermValue = rewriter.create<ConstantOp>(loc, weightPermAttr); + Type newWeightTy = + RankedTensorType::get(newWeightShape, weightTy.getElementType()); + weight = rewriter.create<tosa::TransposeOp>(loc, newWeightTy, weight, + weightPermValue); + + // Broadcast the initial value to the output tensor before convolving. + SmallVector<AffineMap, 4> indexingMaps; + indexingMaps.push_back(AffineMap::get( + /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, + {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + + Value initTensor = rewriter.create<linalg::InitTensorOp>( + loc, resultTy.getShape(), resultETy); + + Value biasBroadcast = + rewriter + .create<linalg::GenericOp>( + loc, resultTy, bias, initTensor, indexingMaps, + getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + nestedBuilder.create<linalg::YieldOp>(nestedLoc, args[0]); + }) + .getResult(0); + + // Extract the attributes for convolution. + llvm::SmallVector<int64_t> stride, dilation; + getValuesFromIntArrayAttribute(strideTosaAttr, stride); + getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); + + // Create the convolution op.
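+ // A quantized tosa.conv2d lowers to conv_2d_nhwc_hwcf_q, which takes the input and weight zero points as two extra scalar operands and subtracts them from the operands before the multiply-accumulate; the non-quantized form lowers to plain conv_2d_nhwc_hwcf.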
+ auto strideAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), stride); + auto dilationAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), dilation); + + if (isQuantized) { + auto quantizationInfo = + op->getAttr("quantization_info").cast<tosa::ConvOpQuantizationAttr>(); + auto iZp = rewriter.getI32IntegerAttr( + quantizationInfo.input_zp().getValue().getSExtValue()); + auto kZp = rewriter.getI32IntegerAttr( + quantizationInfo.weight_zp().getValue().getSExtValue()); + + auto iZpVal = rewriter.create<ConstantOp>(loc, iZp); + auto kZpVal = rewriter.create<ConstantOp>(loc, kZp); + rewriter.replaceOpWithNewOp<linalg::Conv2DNhwcHwcfQOp>( + op, resultTy, ValueRange{input, weight, iZpVal, kZpVal}, + ValueRange{biasBroadcast}, strideAttr, dilationAttr); + return success(); + } + + rewriter.replaceOpWithNewOp<linalg::Conv2DNhwcHwcfOp>( op, resultTy, ValueRange{input, weight}, ValueRange{biasBroadcast}, strideAttr, dilationAttr); return success(); } +}; - if (isa<tosa::Conv2DOp>(op) && isQuantized) { - auto iZpVal = rewriter.create<ConstantOp>(loc, iZp); - auto kZpVal = rewriter.create<ConstantOp>(loc, kZp); - rewriter.replaceOpWithNewOp<linalg::Conv2DInputNhwcFilterOhwiPolyQOp>( - op, resultTy, ValueRange{input, weight, iZpVal, kZpVal}, - ValueRange{biasBroadcast}, strideAttr, dilationAttr); - return success(); - } +class DepthwiseConvConverter + : public OpConversionPattern<tosa::DepthwiseConv2DOp> { +public: + using OpConversionPattern<tosa::DepthwiseConv2DOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(tosa::DepthwiseConv2DOp op, ArrayRef<Value> args, + ConversionPatternRewriter &rewriter) const final { + Location loc = op->getLoc(); + Value input = op->getOperand(0); + Value weight = op->getOperand(1); + Value bias = op->getOperand(2); + + ShapedType inputTy = input.getType().cast<ShapedType>(); + ShapedType weightTy = weight.getType().cast<ShapedType>(); + ShapedType biasTy = bias.getType().cast<ShapedType>(); + ShapedType resultTy = op->getResult(0).getType().cast<ShapedType>(); - if (isa<tosa::DepthwiseConv2DOp>(op)) { + Type inputETy = inputTy.getElementType(); + Type resultETy = resultTy.getElementType(); + + auto padAttr = op->getAttr("pad").cast<ArrayAttr>(); + auto strideTosaAttr = op->getAttr("stride").cast<ArrayAttr>(); + auto dilationTosaAttr = op->getAttr("dilation").cast<ArrayAttr>(); + + bool isQuantized = op->hasAttr("quantization_info"); + IntegerAttr iZp; + IntegerAttr kZp; + if (isQuantized) { + auto quantizationInfo = + op->getAttr("quantization_info").cast<tosa::ConvOpQuantizationAttr>(); + iZp = rewriter.getI32IntegerAttr( + quantizationInfo.input_zp().getValue().getSExtValue()); + kZp = rewriter.getI32IntegerAttr( + quantizationInfo.weight_zp().getValue().getSExtValue()); + } + + if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() || + !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) + return rewriter.notifyMatchFailure(op, + "tosa.conv ops require static shapes"); + + auto weightShape = weightTy.getShape(); + auto resultShape = resultTy.getShape(); + + // Apply padding as necessary. + Attribute zeroAttr = rewriter.getZeroAttr(inputETy); + llvm::SmallVector<int64_t> pad; + pad.resize(2, 0); + getValuesFromIntArrayAttribute(padAttr, pad); + pad.resize(pad.size() + 2, 0); + + input = applyPad(loc, input, pad, zeroAttr, rewriter); + + // Broadcast the initial value to the output tensor before convolving.
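+ // The rank-1 bias is indexed through the channel dimension of the result (affine dim 3), so the generic op below replicates it across the batch and spatial dimensions of the init tensor.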
+ SmallVector<AffineMap, 4> indexingMaps; + indexingMaps.push_back(AffineMap::get( + /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, + {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + + Value initTensor = + rewriter.create<linalg::InitTensorOp>(loc, resultShape, resultETy); + + Value biasBroadcast = + rewriter + .create<linalg::GenericOp>( + loc, resultTy, bias, initTensor, indexingMaps, + getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + nestedBuilder.create<linalg::YieldOp>(nestedLoc, args[0]); + }) + .getResult(0); + + // Extract the attributes for convolution. + llvm::SmallVector<int64_t> stride, dilation; + getValuesFromIntArrayAttribute(strideTosaAttr, stride); + getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); + + // Create the convolution op. + auto strideAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), stride); + auto dilationAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), dilation); ShapedType linalgConvTy = RankedTensorType::get({resultShape[0], resultShape[1], resultShape[2], weightShape[2], weightShape[3]}, resultETy); @@ -976,32 +1085,6 @@ rewriter.replaceOp(op, reshape); return success(); } - - return failure(); -} - -namespace { - -template <typename SrcOp> -class PointwiseConverter : public OpRewritePattern<SrcOp> { -public: - using OpRewritePattern<SrcOp>::OpRewritePattern; - - LogicalResult matchAndRewrite(SrcOp op, - PatternRewriter &rewriter) const final { - return elementwiseMatchAndRewriteHelper(op, rewriter); - } -}; - -template <typename T> -class ConvConverter : public OpConversionPattern<T> { -public: - using OpConversionPattern<T>::OpConversionPattern; - LogicalResult - matchAndRewrite(T op, ArrayRef<Value> args, - ConversionPatternRewriter &rewriter) const final { - return convolutionMatchAndRewriterHelper(op, rewriter); - } }; class TransposeConvConverter @@ -2528,8 +2611,8 @@ ReduceConverter, ArgMaxConverter, ConcatConverter, - ConvConverter<tosa::Conv2DOp>, - ConvConverter<tosa::DepthwiseConv2DOp>, + ConvConverter, + DepthwiseConvConverter, TransposeConvConverter, GatherConverter, PadConverter, diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -144,49 +144,39 @@ implements(ContractionOpInterface) C[None] += cast(U, A[D.m]) * cast(U, B[D.m]) - @linalg_structured_op -def conv_2d_input_nhwc_filter_ohwi_poly( - I=TensorDef(T1, S.N, S.IH, S.IW, S.IC), - K=TensorDef(T2, S.OC, S.KH, S.KW, S.IC), - O=TensorDef(U, S.N, S.OH, S.OW, S.OC, output=True), +def conv_2d_nchw( + I=TensorDef(T1, S.N, S.C, S.IH, S.IW), + K=TensorDef(T2, S.F, S.C, S.KH, S.KW), + O=TensorDef(U, S.N, S.F, S.OH, S.OW, S.C, output=True), strides=AttributeDef(S.SH, S.SW), dilations=AttributeDef(S.DH, S.DW)): - """Performs a 2-D convolution. + """Performs 2-D convolution. Numeric casting is performed on the operands to the inner multiply, promoting them to the same data type as the accumulator/output.
""" - domain(D.n, D.oh, D.ow, D.kh, D.kw, D.oc, D.ic) - O[D.n, D.oh, D.ow, D.oc] += cast( - U, I[D.n, - D.oh * S.SH + D.kh * S.DH, - D.ow * S.SW + D.kw * S.DW, - D.ic]) * cast(U, K[D.oc, D.kh, D.kw, D.ic]) + domain(D.n, D.f, D.oh, D.ow, D.c, D.kh, D.kw) + O[D.n, D.f, D.oh, D.ow] += cast( + U, I[D.n, D.c, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, + ]) * cast(U, K[D.f, D.c, D.kh, D.kw]) @linalg_structured_op -def conv_2d_input_nhwc_filter_ohwi_poly_q( - I=TensorDef(T1, S.N, S.IH, S.IW, S.IC), - K=TensorDef(T2, S.OC, S.KH, S.KW, S.IC), - IZp=ScalarDef(I32), - KZp=ScalarDef(I32), - O=TensorDef(U, S.N, S.OH, S.OW, S.OC, output=True), +def conv_2d_nhwc_hwcf( + I=TensorDef(T1, S.N, S.IH, S.IW, S.C), + K=TensorDef(T2, S.KH, S.KW, S.C, S.F), + O=TensorDef(U, S.N, S.OH, S.OW, S.F, output=True), strides=AttributeDef(S.SH, S.SW), dilations=AttributeDef(S.DH, S.DW)): - """Performs a 2-D quantized convolution. + """Performs 2-D convolution. Numeric casting is performed on the operands to the inner multiply, promoting - them to the same data type as the accumulator/output. Includes zero point - adjustment for quantization. + them to the same data type as the accumulator/output. """ - domain(D.n, D.oh, D.ow, D.kh, D.kw, D.oc, D.ic) - O[D.n, D.oh, D.ow, D.oc] += ((cast( - U, I[D.n, - D.oh * S.SH + D.kh * S.DH, - D.ow * S.SW + D.kw * S.DW, - D.ic]) - cast(U, IZp)) * - (cast(U, K[D.oc, D.kh, D.kw, D.ic]) - cast(U, KZp))) - + domain(D.n, D.oh, D.ow, D.f, D.kh, D.kw, D.c) + O[D.n, D.oh, D.ow, D.f] += cast( + U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.c + ]) * cast(U, K[D.kh, D.kw, D.c, D.f]) @linalg_structured_op def depthwise_conv_2d_input_nhwc_filter_hwc_poly( @@ -206,24 +196,27 @@ D.c]) * cast(U, K[D.kh, D.kw, D.c]) @linalg_structured_op -def conv_2d_nchw( - I=TensorDef(T1, S.N, S.C, S.IH, S.IW), - K=TensorDef(T2, S.F, S.C, S.KH, S.KW), - O=TensorDef(U, S.N, S.F, S.OH, S.OW, S.C, output=True), +def conv_2d_nhwc_hwcf_q( + I=TensorDef(T1, S.N, S.IH, S.IW, S.C), + K=TensorDef(T2, S.KH, S.KW, S.C, S.F), + IZp=ScalarDef(I32), + KZp=ScalarDef(I32), + O=TensorDef(U, S.N, S.OH, S.OW, S.F, output=True), strides=AttributeDef(S.SH, S.SW), dilations=AttributeDef(S.DH, S.DW)): - """Performs 2-D convolution. + """Performs 2-D convolution with zero point offsets. Numeric casting is performed on the operands to the inner multiply, promoting - them to the same data type as the accumulator/output. + them to the same data type as the accumulator/output. This includes the zero + point offsets common to quantized operations. 
""" - domain(D.n, D.f, D.oh, D.ow, D.c, D.kh, D.kw) - O[D.n, D.f, D.oh, D.ow] += cast( - U, I[D.n, D.c, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, - ]) * cast(U, K[D.f, D.c, D.kh, D.kw]) + domain(D.n, D.oh, D.ow, D.f, D.kh, D.kw, D.c) + O[D.n, D.oh, D.ow, D.f] += (cast( + U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.c + ]) - cast(U, IZp)) * (cast(U, K[D.kh, D.kw, D.c, D.f]) - cast(U, KZp)) - -def depthwise_conv2D_nchw( #TODO: Fix name +@linalg_structured_op +def depthwise_conv2D_nchw( I=TensorDef(T1, S.N, S.IH, S.IW, S.IC), K=TensorDef(T2, S.KH, S.KW, S.IC, S.CM), O=TensorDef(U, S.N, S.OH, S.OW, S.IC, S.CM, output=True), @@ -239,8 +232,8 @@ U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.ic]) * cast(U, K[D.kh, D.kw, D.ic, D.cm]) - -def depthwise_conv2D_nchw_q( #TODO: Fix name +@linalg_structured_op +def depthwise_conv2D_nchw_q( I=TensorDef(T1, S.N, S.IH, S.IW, S.IC), K=TensorDef(T2, S.KH, S.KW, S.IC, S.CM), IZp=ScalarDef(I32), diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -1176,14 +1176,19 @@ // ----- -// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)> +// CHECK: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK-LABEL: @conv2d_f32 +// CHECK-LABEL @conv2d_f32 func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () { - // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 45, 40, 28] - // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xf32>) outs(%[[INIT]] : tensor<1x45x40x28xf32>) - // CHECK: linalg.conv_2d_input_nhwc_filter_ohwi_poly {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>) outs(%[[BROADCAST]] : tensor<1x45x40x28xf32>) + // CHECK: %[[W_IN:.+]] = linalg.init_tensor [3, 3, 27, 28] + // CHECK: %[[W:.+]] = linalg.generic {indexing_maps = [#[[MAP0]], #[[MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1 : tensor<28x3x3x27xf32>) outs(%[[W_IN]] : tensor<3x3x27x28xf32>) + // CHECK: linalg.yield %arg3 : f32 + // CHECK: %[[B_IN:.+]] = linalg.init_tensor [1, 45, 40, 28] + // CHECK: %[[B:.+]] = linalg.generic {indexing_maps = [#[[MAP2]], #[[MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xf32>) outs(%[[B_IN]] : tensor<1x45x40x28xf32>) + // CHECK: linalg.yield %arg3 : f32 + // CHECK: %[[CONV:.+]] = linalg.conv_2d_nhwc_hwcf {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %1 : tensor<1x49x42x27xf32>, tensor<3x3x27x28xf32>) outs(%[[B]] : tensor<1x45x40x28xf32>) %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [0, 0, 0, 0], stride = [1, 1], dilation = [2, 1]} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) return } @@ -1192,26 +1197,17 @@ // CHECK-LABEL: @conv2d_padded_f32 func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28x3x3x28xf32>, 
%bias: tensor<28xf32>) -> () { - // CHECK: linalg.pad_tensor %arg0 - // CHECK: linalg.conv_2d_input_nhwc_filter_ohwi_poly + // CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: linalg.conv_2d_nhwc_hwcf %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [1, 1, 1, 1], stride = [1, 1], dilation = [2, 1]} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) return } // ----- -// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - // CHECK-LABEL: @conv2d_quant func @conv2d_quant(%arg0 : tensor<1x12x12x1xi8>, %arg1 : tensor<1024x3x3x1xi8>, %arg2 : tensor<1024xi32>) -> () { - // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 10, 10, 1024] - // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<1024xi32>) outs(%[[INIT]] : tensor<1x10x10x1024xi32>) - // CHECK: ^bb0(%arg3: i32, %arg4: i32): - // CHECK: linalg.yield %arg3 : i32 - // CHECK: %[[C128:.+]] = constant -128 - // CHECK: %[[C42:.+]] = constant 42 - // CHECK: linalg.conv_2d_input_nhwc_filter_ohwi_poly_q {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1, %[[C128]], %[[C42]] : tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, i32, i32) outs(%1 : tensor<1x10x10x1024xi32>) + // CHECK: linalg.conv_2d_nhwc_hwcf_q %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = [1, 1], pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, weight_zp = 42 : i32}, stride = [1, 1]} : (tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x10x10x1024xi32> return } @@ -1229,7 +1225,7 @@ // CHECK: linalg.yield %arg3 : f32 // CHECK: } -> tensor<1x5x5x33xf32> // CHECK: [[DBIAS:%.+]] = linalg.tensor_expand_shape [[BIAS]] {{\[}}[0], [1], [2], [3, 4]] - // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2D_nchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>) outs([[DBIAS]] : tensor<1x5x5x3x11xf32>) + // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv2D_nchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>) outs([[DBIAS]] : tensor<1x5x5x3x11xf32>) // CHECK: linalg.tensor_collapse_shape %3 {{\[}}[0], [1], [2], [3, 4]] %2 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) { pad = [0, 0, 0, 0], stride = [1, 1], dilation = [1, 1] } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>) -> (tensor<1x5x5x33xf32>) return @@ -1260,8 +1256,8 @@ // CHECK-LABEL: @transpose_conv func @transpose_conv(%arg0 : tensor<1x12x12x2xf32>, %arg1 : tensor<4x3x3x2xf32>, %arg2 : tensor<4xf32>) -> () { - // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 2, 2, 0] high[0, 2, 2, 0] - // CHECK: linalg.conv_2d_input_nhwc_filter_ohwi_poly {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins([[PAD]], {{%.+}} : tensor<1x16x16x2xf32>, tensor<4x3x3x2xf32>) + // CHECK: linalg.pad_tensor %arg0 low[0, 2, 2, 0] high[0, 2, 2, 0] + // CHECK: linalg.conv_2d_nhwc_hwcf %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {dilation = [1, 1], out_pad = [0, 0], out_shape = [1, 14, 14, 4], stride = [1, 1]} : (tensor<1x12x12x2xf32>, tensor<4x3x3x2xf32>, tensor<4xf32>) -> tensor<1x14x14x4xf32> return } @@ -1271,7 +1267,7 @@ // CHECK-LABEL: @transpose_conv_dilated func 
@transpose_conv_dilated(%arg0 : tensor<1x12x12x2xf32>, %arg1 : tensor<4x3x3x2xf32>, %arg2 : tensor<4xf32>) -> () { // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 4, 4, 0] high[0, 4, 4, 0] - // CHECK: linalg.conv_2d_input_nhwc_filter_ohwi_poly {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins([[PAD]], {{%.+}} : tensor<1x20x20x2xf32>, tensor<4x3x3x2xf32>) + // CHECK: linalg.conv_2d_nhwc_hwcf {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins([[PAD]], {{%.+}} : tensor<1x20x20x2xf32>, tensor<3x3x2x4xf32>) %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {dilation = [2, 2], out_pad = [0, 0], out_shape = [1, 16, 16, 4], stride = [1, 1]} : (tensor<1x12x12x2xf32>, tensor<4x3x3x2xf32>, tensor<4xf32>) -> tensor<1x16x16x4xf32> return } diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1,19 +1,5 @@ // RUN: mlir-opt -split-input-file -verify-diagnostics %s | FileCheck %s -// CHECK-LABEL: func @conv_2d_input_nhwc_filter_ohwi_poly_q_tensor -func @conv_2d_input_nhwc_filter_ohwi_poly_q_tensor(%input: tensor<2x4x5x3xi8>, %filter: tensor<2x2x2x3xi8>) -> tensor<2x3x4x2xi32> { - %zero = constant 0 : i32 - %init = linalg.init_tensor [2, 3, 4, 2] : tensor<2x3x4x2xi32> - %fill = linalg.fill(%zero, %init) : i32, tensor<2x3x4x2xi32> -> tensor<2x3x4x2xi32> - %c128 = constant -128 : i32 - %c42 = constant 42 : i32 - %0 = linalg.conv_2d_input_nhwc_filter_ohwi_poly_q - { dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> } - ins(%input, %filter, %c128, %c42 : tensor<2x4x5x3xi8>, tensor<2x2x2x3xi8>, i32, i32) - outs(%fill : tensor<2x3x4x2xi32>) -> tensor<2x3x4x2xi32> - return %0 : tensor<2x3x4x2xi32> -} - // CHECK-LABEL: func @depthwise_conv_2d_input_nhwc_filter_hwcf_tensor func @depthwise_conv_2d_input_nhwc_filter_hwcf_tensor(%input: tensor<2x4x5x2xf32>, %filter: tensor<2x2x2x3xf32>) -> tensor<2x3x4x2x3xf32> { %zero = constant 0.000000e+00 : f32