diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -1223,15 +1223,15 @@ permutation or broadcasting. Elements whose corresponding mask element is `0` are masked out and replaced with `padding`. - An optional boolean array attribute `in_bounds` specifies for every vector - dimension if the transfer is guaranteed to be within the source bounds. - While the starting point of the transfer has to be in-bounds, accesses may - run out-of-bounds as indices increase. Broadcast dimensions must always be - in-bounds. If specified, the `in_bounds` array length has to be equal to the - vector rank. In absence of the attribute, accesses along all dimensions - (except for broadcasts) may run out-of-bounds. A `vector.transfer_read` can - be lowered to a simple load if all dimensions are specified to be within - bounds and no `mask` was specified. + An optional boolean array attribute `in_bounds` specifies for every tensor/ + memref dimension if the transfer is guaranteed to be within the source + bounds. While the starting point of the transfer has to be in-bounds, + accesses may run out-of-bounds as indices increase. If specified, the + `in_bounds` array length has to be equal to the rank of the source. In + absence of the attribute, accesses along all dimensions may run + out-of-bounds. A `vector.transfer_read` can be lowered to a simple load if + all dimensions are specified to be within bounds and no `mask` was + specified. This operation is called 'read' by opposition to 'load' because the super-vector granularity is generally not representable with a single @@ -1464,14 +1464,15 @@ specified to mask out elements. Elements whose corresponding mask element is `0` are masked out. - An optional boolean array attribute `in_bounds` specifies for every vector - dimension if the transfer is guaranteed to be within the source bounds. - While the starting point of the transfer has to be in-bounds, accesses may - run out-of-bounds as indices increase. If specified, the `in_bounds` array - length has to be equal to the vector rank. In absence of the attribute, - accesses along all dimensions may run out-of-bounds. A - `vector.transfer_write` can be lowered to a simple store if all dimensions - are specified to be within bounds and no `mask` was specified. + An optional boolean array attribute `in_bounds` specifies for every tensor/ + memref dimension if the transfer is guaranteed to be within the source + bounds. While the starting point of the transfer has to be in-bounds, + accesses may run out-of-bounds as indices increase. If specified, the + `in_bounds` array length has to be equal to the rank of the source. In + absence of the attribute, accesses along all dimensions may run + out-of-bounds. A `vector.transfer_write` can be lowered to a simple store + if all dimensions are specified to be within bounds and no `mask` was + specified. 
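For illustration, a minimal read under the new convention (hypothetical shapes, not from the patch): the source memref has rank 3, so `in_bounds` has three entries; `d1` does not appear in the permutation map and therefore must be marked in-bounds, while the dynamic `d2` may run out-of-bounds.

```mlir
%r = vector.transfer_read %A[%c0, %c0, %c0], %pad
    {in_bounds = [true, true, false],
     permutation_map = affine_map<(d0, d1, d2) -> (d0, d2)>}
    : memref<4x2x?xf32>, vector<4x8xf32>
```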
This operation is called 'write' by opposition to 'store' because the super-vector granularity is generally not representable with a single diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h @@ -122,7 +122,7 @@ /// memref.cast %alloc: memref<B...> to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index // } -/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... true]} +/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true, true]} /// ``` /// where `alloc` is a top of the function alloca'ed buffer of one vector. /// diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h @@ -89,8 +89,7 @@ /// memref.cast %alloc: memref<B...> to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index // } -/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... -/// true]} +/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true, true]} /// ``` /// where `alloc` is a top of the function alloca'ed buffer of one vector. /// diff --git a/mlir/include/mlir/Interfaces/VectorInterfaces.td b/mlir/include/mlir/Interfaces/VectorInterfaces.td --- a/mlir/include/mlir/Interfaces/VectorInterfaces.td +++ b/mlir/include/mlir/Interfaces/VectorInterfaces.td @@ -75,9 +75,8 @@ /*args=*/(ins "unsigned":$dim), /*methodBody=*/"", /*defaultImplementation=*/[{ - return $_op.isBroadcastDim(dim) - || ($_op.getInBounds() - && cast<::mlir::BoolAttr>(cast<::mlir::ArrayAttr>(*$_op.getInBounds())[dim]).getValue()); + return $_op.getInBounds() + && cast<::mlir::BoolAttr>(cast<::mlir::ArrayAttr>(*$_op.getInBounds())[dim]).getValue(); }] >, InterfaceMethod< @@ -216,7 +215,8 @@ /*args=*/(ins), /*methodBody=*/"", /*defaultImplementation=*/[{ - for (unsigned idx = 0, e = $_op.getTransferRank(); idx < e; ++idx) + for (unsigned idx = 0, e = $_op.getShapedType().getRank(); + idx < e; ++idx) if (!$_op.isDimInBounds(idx)) return true; return false; diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp --- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp +++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp @@ -107,6 +107,21 @@ } } +/// Calculate the in_bounds attribute of the new vector transfer op. The dropped +/// vector transfer dimension is now in-bounds; an scf.if check is generated +/// around the new transfer op. +template <typename OpTy> +static ArrayAttr getXferInBoundsAttr(OpBuilder &b, OpTy xferOp) { + SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); + if (xferOp.getInBounds().has_value()) + inBounds = extractFromIntegerArrayAttr<bool>(xferOp.getInBoundsAttr()); + auto dim = unpackedDim(xferOp); + bool isBroadcast = !dim.has_value(); + if (!isBroadcast) + inBounds[*dim] = true; + return b.getBoolArrayAttr(inBounds); +} + static void maybeYieldValue(OpBuilder &b, Location loc, bool hasRetVal, Value value) { if (hasRetVal) { @@ -173,7 +188,7 @@ bool isBroadcast = !dim; // No in-bounds check for broadcasts. 
Location loc = xferOp.getLoc(); ImplicitLocOpBuilder lb(xferOp.getLoc(), b); - if (!xferOp.isDimInBounds(0) && !isBroadcast) { + if (!isBroadcast && !xferOp.isDimInBounds(*dim)) { Value memrefDim = vector::createOrFoldDimOp(b, loc, xferOp.getSource(), *dim); AffineExpr d0, d1; @@ -239,13 +254,6 @@ }); } -/// Given an ArrayAttr, return a copy where the first element is dropped. -static ArrayAttr dropFirstElem(OpBuilder &b, ArrayAttr attr) { - if (!attr) - return attr; - return ArrayAttr::get(b.getContext(), attr.getValue().drop_front()); -} - /// Add the pass label to a vector transfer op if its rank is not the target /// rank. template <typename OpTy> @@ -410,11 +418,10 @@ Location loc = xferOp.getLoc(); auto bufferType = dyn_cast<ShapedType>(buffer.getType()); auto vecType = dyn_cast<VectorType>(bufferType.getElementType()); - auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr()); auto newXferOp = b.create<vector::TransferReadOp>( loc, vecType, xferOp.getSource(), xferIndices, AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), - xferOp.getPadding(), Value(), inBoundsAttr); + xferOp.getPadding(), Value(), getXferInBoundsAttr(b, xferOp)); maybeApplyPassLabel(b, newXferOp, options.targetRank); @@ -497,13 +504,12 @@ Location loc = xferOp.getLoc(); auto vec = b.create<memref::LoadOp>(loc, buffer, loadIndices); - auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr()); auto source = loopState.empty() ? xferOp.getSource() : loopState[0]; Type type = isTensorOp(xferOp) ? xferOp.getShapedType() : Type(); auto newXferOp = b.create<vector::TransferWriteOp>( loc, type, vec, source, xferIndices, AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(), - inBoundsAttr); + getXferInBoundsAttr(b, xferOp)); maybeApplyPassLabel(b, newXferOp, options.targetRank); @@ -931,11 +937,10 @@ getInsertionIndices(xferOp, insertionIndices); insertionIndices.push_back(i); - auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr()); auto newXferOp = b.create<vector::TransferReadOp>( loc, newXferVecType, xferOp.getSource(), xferIndices, AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), - xferOp.getPadding(), Value(), inBoundsAttr); + xferOp.getPadding(), Value(), getXferInBoundsAttr(b, xferOp)); maybeAssignMask(b, xferOp, newXferOp, i); return b.create<vector::InsertOp>(loc, newXferOp, vec, insertionIndices); @@ -1059,11 +1064,10 @@ auto extracted = b.create<vector::ExtractOp>(loc, vec, extractionIndices); - auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr()); auto newXferOp = b.create<vector::TransferWriteOp>( loc, sourceType, extracted, source, xferIndices, AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(), - inBoundsAttr); + getXferInBoundsAttr(b, xferOp)); maybeAssignMask(b, xferOp, newXferOp, i); @@ -1214,7 +1218,8 @@ /// E.g.: /// ``` /// vector.transfer_write %vec, %A[%a, %b] -/// {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]} +/// {permutation_map = affine_map<(d0, d1) -> (d0)>, +/// in_bounds = [true, true]} /// : vector<9xf32>, memref<?x?xf32> /// ``` /// Is rewritten to approximately the following pseudo-IR: diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp --- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -1201,8 +1201,14 @@ LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); LLVM_DEBUG(permutationMap.print(dbgs())); + // Non-transfer dims are in-bounds. 
+ SmallVector<bool> inBounds(memRefType.getRank(), true); + for (AffineExpr expr : permutationMap.getResults()) + if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) + inBounds[dimExpr.getPosition()] = false; auto transfer = state.builder.create<vector::TransferReadOp>( - loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices, permutationMap); + loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices, permutationMap, + inBounds); // Register replacement for future uses in the scope. state.registerOpVectorReplacement(loadOp, transfer); @@ -1244,9 +1250,14 @@ LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); LLVM_DEBUG(permutationMap.print(dbgs())); + // Non-transfer dims are in-bounds. + SmallVector<bool> inBounds(memRefType.getRank(), true); + for (AffineExpr expr : permutationMap.getResults()) + if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) + inBounds[dimExpr.getPosition()] = false; auto transfer = state.builder.create<vector::TransferWriteOp>( storeOp.getLoc(), vectorValue, storeOp.getMemRef(), indices, - permutationMap); + permutationMap, inBounds); LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer); // Register replacement for future uses in the scope. diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -640,7 +640,7 @@ // be in-bounds. if (auto maskOp = dyn_cast<vector::MaskOp>(write)) { auto maskedWriteOp = cast<vector::TransferWriteOp>(maskOp.getMaskableOp()); - SmallVector<bool> inBounds(maskedWriteOp.getVectorType().getRank(), true); + SmallVector<bool> inBounds(maskedWriteOp.getShapedType().getRank(), true); maskedWriteOp.setInBoundsAttr(rewriter.getBoolArrayAttr(inBounds)); } @@ -1083,7 +1083,7 @@ // `tensor.extract_element` is always in-bounds, hence the following holds. auto dstRank = resultType.getRank(); auto srcRank = extractOp.getTensor().getType().getRank(); - SmallVector<bool> inBounds(dstRank, true); + SmallVector<bool> inBounds(srcRank, true); // 2a. Handle scalar broadcast access. if (memAccessKind == VectorMemoryAccessKind::ScalarBroadcast) { @@ -1337,15 +1337,21 @@ SmallVector<Value> indices(linalgOp.getShape(opOperand).size(), zero); + // Non-transfer dims are in-bounds, all others are out-of-bounds. + SmallVector<bool> inBounds(indices.size(), true); + for (AffineExpr expr : readMap.getResults()) + if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) + inBounds[dimExpr.getPosition()] = false; + Operation *read = rewriter.create<vector::TransferReadOp>( - loc, readType, opOperand->get(), indices, readMap); + loc, readType, opOperand->get(), indices, readMap, inBounds); read = state.maskOperation(rewriter, read, linalgOp, maskingMap); Value readValue = read->getResult(0); // 3.b. If masked, set in-bounds to true. Masking guarantees that the access // will be in-bounds. 
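A sketch of what the vectorizers now emit (made-up shapes and values): the permutation map broadcasts along the second result, `d1` is a non-transfer dim and is seeded in-bounds, and the transfer dim `d0` starts out conservatively out-of-bounds.

```mlir
%v = vector.transfer_read %m[%i, %j], %pad
    {in_bounds = [false, true],
     permutation_map = affine_map<(d0, d1) -> (d0, 0)>}
    : memref<?x?xf32>, vector<4x8xf32>
```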
if (auto maskOp = dyn_cast<vector::MaskOp>(read)) { - SmallVector<bool> inBounds(readType.getRank(), true); + SmallVector<bool> inBounds(indices.size(), true); cast<vector::TransferReadOp>(maskOp.getMaskableOp()) .setInBoundsAttr(rewriter.getBoolArrayAttr(inBounds)); } @@ -1927,7 +1933,7 @@ /// is rewritten to: /// ``` /// %r = vector.transfer_read %src[%c0, %c0], %padding -/// {in_bounds = [true, true]} +/// {in_bounds = [false, false]} /// : tensor<?x?xf32>, vector<17x5xf32> /// ``` /// Note: By restricting this pattern to in-bounds TransferReadOps, we can be @@ -1956,7 +1962,7 @@ return failure(); rewriter.updateRootInPlace(xferOp, [&]() { - SmallVector<bool> inBounds(xferOp.getVectorType().getRank(), false); + SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); xferOp->setAttr(xferOp.getInBoundsAttrName(), rewriter.getBoolArrayAttr(inBounds)); xferOp.getSourceMutable().assign(padOp.getSource()); @@ -2033,7 +2039,7 @@ // Insert the new TransferWriteOp at position of the old TransferWriteOp. rewriter.setInsertionPoint(xferOp); - SmallVector<bool> inBounds(xferOp.getVectorType().getRank(), false); + SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); auto newXferOp = rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( xferOp, padOp.getSource().getType(), xferOp.getVector(), padOp.getSource(), xferOp.getIndices(), xferOp.getPermutationMapAttr(), @@ -2142,7 +2148,8 @@ /// %0 = vector.transfer_read %src[%c0, %c0], %padding /// : tensor<?x?xf32>, vector<17x5xf32> /// %r = vector.transfer_write %0, %dest[%a, %b, %c0, %c0] -/// {in_bounds = [true, true]} : vector<17x5xf32>, tensor<?x?x17x5xf32> +/// {in_bounds = [true, true, true, true]} : vector<17x5xf32>, +/// tensor<?x?x17x5xf32> /// ``` /// /// This rewrite is possible if: @@ -2208,7 +2215,7 @@ // source must fit into the destination at the specified offsets. auto writeIndices = ofrToIndexValues(rewriter, padOp.getLoc(), insertOp.getMixedOffsets()); - SmallVector<bool> inBounds(vecRank, true); + SmallVector<bool> inBounds(tensorRank, true); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( insertOp, read, insertOp.getDest(), writeIndices, ArrayRef<bool>{inBounds}); @@ -2337,11 +2344,17 @@ // The `masked` attribute is only valid on this padded buffer. // When forwarding to vector.transfer_read, the attribute must be reset // conservatively. + + // in_bounds is explicitly reset. Non-transfer dims are in-bounds, all others + // are out-of-bounds. + SmallVector<bool> inBounds(xferOp.getIndices().size(), true); + for (AffineExpr expr : xferOp.getPermutationMap().getResults()) + if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) + inBounds[dimExpr.getPosition()] = false; Value res = rewriter.create<vector::TransferReadOp>( xferOp.getLoc(), xferOp.getVectorType(), in, xferOp.getIndices(), xferOp.getPermutationMapAttr(), xferOp.getPadding(), xferOp.getMask(), - // in_bounds is explicitly reset - /*inBoundsAttr=*/ArrayAttr()); + rewriter.getBoolArrayAttr(inBounds)); if (maybeFillOp) rewriter.eraseOp(maybeFillOp); @@ -2395,11 +2408,17 @@ // The `masked` attribute is only valid on this padded buffer. // When forwarding to vector.transfer_write, the attribute must be reset // conservatively. + + // in_bounds is explicitly reset. Non-transfer dims are in-bounds, all others + // are out-of-bounds. 
+ SmallVector<bool> inBounds(xferOp.getIndices().size(), true); + for (AffineExpr expr : xferOp.getPermutationMap().getResults()) + if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) + inBounds[dimExpr.getPosition()] = false; rewriter.create<vector::TransferWriteOp>( xferOp.getLoc(), xferOp.getVector(), out, xferOp.getIndices(), xferOp.getPermutationMapAttr(), xferOp.getMask(), - // in_bounds is explicitly reset - /*inBoundsAttr=*/ArrayAttr()); + rewriter.getBoolArrayAttr(inBounds)); rewriter.eraseOp(copyOp); rewriter.eraseOp(xferOp); diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp --- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp @@ -45,6 +45,32 @@ // Utility functions //===----------------------------------------------------------------------===// +/// Compute the new in_bounds attribute of a vector transfer op when folding a +/// potentially rank-reducing alias op into a vector transfer op. +template <typename OpTy> +static ArrayAttr +expandInBoundsToRank(Builder &b, OpTy xferOp, int64_t rank, + const llvm::SmallBitVector &projectedDimensions) { + SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); + if (xferOp.getInBounds().has_value()) + inBounds = extractFromIntegerArrayAttr<bool>(xferOp.getInBoundsAttr()); + SmallVector<bool> expandedInBounds; + int64_t idx = 0; + for (int64_t i = 0; i < rank; ++i) { + if (projectedDimensions.test(i)) { + // This dimension was rank-reduced. It must be in-bounds after folding. + expandedInBounds.push_back(true); + continue; + } + + // Not a rank-reduced dim: take in_bounds from the xfer op. + assert(idx < inBounds.size() && "invalid rank"); + expandedInBounds.push_back(inBounds[idx++]); + } + assert(idx == inBounds.size() && "invalid rank"); + return b.getBoolArrayAttr(expandedInBounds); +} + /// Given the 'indices' of a load/store operation where the memref is a result /// of a expand_shape op, returns the indices w.r.t to the source memref of the /// expand_shape op. 
For example @@ -404,12 +430,14 @@ op, op.getType(), subViewOp.getSource(), sourceIndices); }) .Case([&](vector::TransferReadOp op) { + int64_t rank = subViewOp.getSourceType().getRank(); rewriter.replaceOpWithNewOp<vector::TransferReadOp>( op, op.getVectorType(), subViewOp.getSource(), sourceIndices, - AffineMapAttr::get(expandDimsToRank( - op.getPermutationMap(), subViewOp.getSourceType().getRank(), - subViewOp.getDroppedDims())), - op.getPadding(), /*mask=*/Value(), op.getInBoundsAttr()); + AffineMapAttr::get(expandDimsToRank(op.getPermutationMap(), rank, + subViewOp.getDroppedDims())), + op.getPadding(), /*mask=*/Value(), + expandInBoundsToRank(rewriter, op, rank, + subViewOp.getDroppedDims())); }) .Case([&](gpu::SubgroupMmaLoadMatrixOp op) { rewriter.replaceOpWithNewOp<gpu::SubgroupMmaLoadMatrixOp>( @@ -533,12 +561,13 @@ op.getNontemporal()); }) .Case([&](vector::TransferWriteOp op) { + int64_t rank = subViewOp.getSourceType().getRank(); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( op, op.getValue(), subViewOp.getSource(), sourceIndices, - AffineMapAttr::get(expandDimsToRank( - op.getPermutationMap(), subViewOp.getSourceType().getRank(), - subViewOp.getDroppedDims())), - op.getInBoundsAttr()); + AffineMapAttr::get(expandDimsToRank(op.getPermutationMap(), rank, + subViewOp.getDroppedDims())), + expandInBoundsToRank(rewriter, op, rank, + subViewOp.getDroppedDims())); }) .Case([&](gpu::SubgroupMmaStoreMatrixOp op) { rewriter.replaceOpWithNewOp<gpu::SubgroupMmaStoreMatrixOp>( diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp --- a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp @@ -84,6 +84,32 @@ return success(); } +/// Compute the new in_bounds attribute of a vector transfer op when folding a +/// potentially rank-reducing subset op into a vector transfer op. +template <typename OpTy> +static ArrayAttr +expandInBoundsToRank(Builder &b, OpTy xferOp, int64_t rank, + const llvm::SmallBitVector &projectedDimensions) { + SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); + if (xferOp.getInBounds().has_value()) + inBounds = extractFromIntegerArrayAttr<bool>(xferOp.getInBoundsAttr()); + SmallVector<bool> expandedInBounds; + int64_t idx = 0; + for (int64_t i = 0; i < rank; ++i) { + if (projectedDimensions.test(i)) { + // This dimension was rank-reduced. It must be in-bounds after folding. + expandedInBounds.push_back(true); + continue; + } + + // Not a rank-reduced dim: take in_bounds from the xfer op. 
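The IR-level effect of `expandInBoundsToRank`, sketched with assumed shapes: folding a rank-reducing `memref.subview` into a transfer re-introduces the dropped dimension, whose new `in_bounds` entry is `true`; the remaining entries carry over unchanged.

```mlir
// Before folding: dim 1 of %src is rank-reduced away by the subview.
%sv = memref.subview %src[0, 0, 0] [4, 1, 8] [1, 1, 1]
    : memref<4x1x8xf32> to memref<4x8xf32>
%v = vector.transfer_read %sv[%c0, %c0], %pad {in_bounds = [true, false]}
    : memref<4x8xf32>, vector<4x8xf32>

// After folding: the dropped dim 1 gets an in_bounds entry of `true`.
%v = vector.transfer_read %src[%c0, %c0, %c0], %pad
    {in_bounds = [true, true, false],
     permutation_map = affine_map<(d0, d1, d2) -> (d0, d2)>}
    : memref<4x1x8xf32>, vector<4x8xf32>
```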
+ assert(idx < inBounds.size() && "invalid rank"); + expandedInBounds.push_back(inBounds[idx++]); + } + assert(idx == inBounds.size() && "invalid rank"); + return b.getBoolArrayAttr(expandedInBounds); +} + LogicalResult TransferReadOfExtractSliceOpFolder::matchAndRewrite( vector::TransferReadOp readOp, PatternRewriter &rewriter) const { auto extractSliceOp = @@ -105,13 +131,16 @@ extractSliceOp.getMixedStrides(), extractSliceOp.getDroppedDims(), indices, sourceIndices); + int64_t expandedRank = extractSliceOp.getSourceType().getRank(); rewriter.replaceOpWithNewOp<vector::TransferReadOp>( readOp, readOp.getVectorType(), extractSliceOp.getSource(), sourceIndices, - AffineMapAttr::get(expandDimsToRank( - readOp.getPermutationMap(), extractSliceOp.getSourceType().getRank(), - extractSliceOp.getDroppedDims())), + AffineMapAttr::get(expandDimsToRank(readOp.getPermutationMap(), + expandedRank, + extractSliceOp.getDroppedDims())), readOp.getPadding(), - /*mask=*/Value(), readOp.getInBoundsAttr()); + /*mask=*/Value(), + expandInBoundsToRank(rewriter, readOp, expandedRank, + extractSliceOp.getDroppedDims())); return success(); } @@ -137,12 +166,14 @@ insertSliceOp.getMixedStrides(), insertSliceOp.getDroppedDims(), indices, sourceIndices); + int64_t expandedRank = insertSliceOp.getDestType().getRank(); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( insertSliceOp, writeOp.getValue(), insertSliceOp.getDest(), sourceIndices, AffineMapAttr::get(expandDimsToRank(writeOp.getPermutationMap(), - insertSliceOp.getDestType().getRank(), + expandedRank, insertSliceOp.getDroppedDims())), - writeOp.getInBoundsAttr()); + expandInBoundsToRank(rewriter, writeOp, expandedRank, + insertSliceOp.getDroppedDims())); return success(); } diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -3464,17 +3464,28 @@ << ") don't match"; if (inBounds) { - if (permutationMap.getNumResults() != static_cast<int64_t>(inBounds.size())) + if (shapedType.getRank() != static_cast<int64_t>(inBounds.size())) return op->emitOpError("expects the optional in_bounds attr of same rank " - "as permutation_map results: ") - << AffineMapAttr::get(permutationMap) + "as the source type: ") + << shapedType.getRank() << " vs inBounds of size: " << inBounds.size(); - for (unsigned int i = 0; i < permutationMap.getNumResults(); ++i) - if (permutationMap.getResult(i).isa<AffineConstantExpr>() && - !llvm::cast<BoolAttr>(inBounds.getValue()[i]).getValue()) - return op->emitOpError("requires broadcast dimensions to be in-bounds"); } + // Make sure that all non-transfer dimensions are in-bounds. + SmallVector<bool> inBoundsVals(op.getShapedType().getRank(), false); + if (inBounds) + inBoundsVals = extractFromIntegerArrayAttr<bool>(inBounds); + DenseSet<int64_t> xferDims; + for (AffineExpr expr : permutationMap.getResults()) { + if (auto dimExpr = expr.template dyn_cast<AffineDimExpr>()) + xferDims.insert(dimExpr.getPosition()); + } + for (int64_t i = 0, e = op.getShapedType().getRank(); i < e; ++i) + if (!xferDims.contains(i) && !op.isDimInBounds(i)) { + return op->emitOpError( + "expects that all non-transfer dims are in-bounds"); + } + return success(); } @@ -3654,32 +3665,42 @@ template <typename TransferOp> static LogicalResult foldTransferInBoundsAttribute(TransferOp op) { - // TODO: support 0-d corner case. - // TODO: Be less conservative. - if (op.getTransferRank() == 0) - return failure(); - AffineMap permutationMap = op.getPermutationMap(); + SmallVector<int64_t> accessedChunk = op.getTransferChunkAccessed(); + + // Prepare new in_bounds values. 
bool changed = false; - SmallVector<bool> newInBounds; - newInBounds.reserve(op.getTransferRank()); - for (unsigned i = 0; i < op.getTransferRank(); ++i) { + SmallVector<bool> newInBounds(op.getShapedType().getRank(), false); + if (op.getInBounds().has_value()) + newInBounds = extractFromIntegerArrayAttr<bool>(op.getInBoundsAttr()); + + for (unsigned i = 0; i < op.getShapedType().getRank(); ++i) { // Already marked as in-bounds, nothing to see here. - if (op.isDimInBounds(i)) { - newInBounds.push_back(true); + if (newInBounds[i]) continue; - } - // Currently out-of-bounds, check whether we can statically determine it is - // inBounds. - auto dimExpr = permutationMap.getResult(i).dyn_cast<AffineDimExpr>(); - assert(dimExpr && "Broadcast dims must be in-bounds"); - auto inBounds = - isInBounds(op, /*resultIdx=*/i, /*indicesIdx=*/dimExpr.getPosition()); - newInBounds.push_back(inBounds); - // We commit the pattern if it is "more inbounds". + + // Cannot infer in_bounds for dynamic dimensions. + if (op.getShapedType().isDynamicDim(i)) + continue; + int64_t sourceSize = op.getShapedType().getDimSize(i); + + // Cannot infer in_bounds for non-constant indices. + Value index = op.getIndices()[i]; + std::optional<int64_t> constantOffset = getConstantIntValue(index); + if (!constantOffset.has_value()) + continue; + + // If the dimension is not part of the transfer shape, it's like a "1" + // dimension that is cast away. + int64_t vectorSize = accessedChunk[i] == 0 ? 1 : accessedChunk[i]; + + bool inBounds = constantOffset.value() + vectorSize <= sourceSize; + newInBounds[i] = inBounds; changed |= inBounds; } + if (!changed) return failure(); + // OpBuilder is only used as a helper to build an I64ArrayAttr. OpBuilder b(op.getContext()); op->setAttr(TransferOp::getInBoundsAttrStrName(), @@ -3747,11 +3768,11 @@ /// Example: /// ``` /// %w0 = vector.transfer_write %v0, %arg0[%c0, %c0, %c0] -/// {in_bounds = [true, true], +/// {in_bounds = [true, true, true], /// permutation_map = affine_map<(d0, d1, d2) -> (d2, d1)>} : /// vector<4x1xf32>, tensor<4x4x4xf32> /// %r = vector.transfer_read %w0[%c0, %c0, %c0], %cf0 -/// {in_bounds = [true, true, true, true], +/// {in_bounds = [true, true, true], /// permutation_map = affine_map<(d0, d1, d2) -> (d1, 0, d2, 0)>} : /// tensor<4x4x4xf32>, vector<1x100x4x5xf32> /// ``` diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp @@ -20,19 +20,6 @@ using namespace mlir; using namespace mlir::vector; -/// Transpose a vector transfer op's `in_bounds` attribute by applying reverse -/// permutation based on the given indices. -static ArrayAttr -inverseTransposeInBoundsAttr(OpBuilder &builder, ArrayAttr attr, - const SmallVector<unsigned> &permutation) { - SmallVector<bool> newInBoundsValues(permutation.size()); - size_t index = 0; - for (unsigned pos : permutation) - newInBoundsValues[pos] = - cast<BoolAttr>(attr.getValue()[index++]).getValue(); - return builder.getBoolArrayAttr(newInBoundsValues); -} - /// Extend the rank of a vector Value by `addedRanks` by adding outer unit /// dimensions. static Value extendVectorRank(OpBuilder &builder, Location loc, Value vec, @@ -119,19 +106,13 @@ newVectorShape[pos.value()] = originalShape[pos.index()]; } - // Transpose in_bounds attribute. - ArrayAttr newInBoundsAttr = - op.getInBounds() ? 
inverseTransposeInBoundsAttr( rewriter, op.getInBounds().value(), permutation) - : ArrayAttr(); - // Generate new transfer_read operation. VectorType newReadType = VectorType::get(newVectorShape, op.getVectorType().getElementType()); Value newRead = rewriter.create<vector::TransferReadOp>( op.getLoc(), newReadType, op.getSource(), op.getIndices(), AffineMapAttr::get(newMap), op.getPadding(), op.getMask(), - newInBoundsAttr); + op.getInBounds() ? *op.getInBounds() : ArrayAttr()); // Transpose result of transfer_read. SmallVector<int64_t> transposePerm(permutation.begin(), permutation.end()); @@ -189,12 +170,6 @@ return expr.dyn_cast<AffineDimExpr>().getPosition(); }); - // Transpose in_bounds attribute. - ArrayAttr newInBoundsAttr = - op.getInBounds() ? inverseTransposeInBoundsAttr( - rewriter, op.getInBounds().value(), permutation) - : ArrayAttr(); - // Generate new transfer_write operation. Value newVec = rewriter.create<vector::TransposeOp>( op.getLoc(), op.getVector(), indices); @@ -202,7 +177,7 @@ map.getNumDims(), map.getNumResults(), rewriter.getContext()); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( op, newVec, op.getSource(), op.getIndices(), AffineMapAttr::get(newMap), - op.getMask(), newInBoundsAttr); + op.getMask(), op.getInBounds() ? *op.getInBounds() : ArrayAttr()); return success(); } @@ -272,15 +247,9 @@ exprs.append(map.getResults().begin(), map.getResults().end()); AffineMap newMap = AffineMap::get(map.getNumDims(), 0, exprs, op.getContext()); - // All the new dimensions added are inbound. - SmallVector<bool> newInBoundsValues(missingInnerDim.size(), true); - for (int64_t i = 0, e = op.getVectorType().getRank(); i < e; ++i) { - newInBoundsValues.push_back(op.isDimInBounds(i)); - } - ArrayAttr newInBoundsAttr = rewriter.getBoolArrayAttr(newInBoundsValues); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( op, newVec, op.getSource(), op.getIndices(), AffineMapAttr::get(newMap), - newMask, newInBoundsAttr); + newMask, op.getInBoundsAttr()); return success(); } }; @@ -353,15 +322,10 @@ VectorType newReadType = VectorType::get(newShape, originalVecType.getElementType()); - ArrayAttr newInBoundsAttr = - op.getInBounds() - ? 
rewriter.getArrayAttr( - op.getInBoundsAttr().getValue().take_back(reducedShapeRank)) - : ArrayAttr(); Value newRead = rewriter.create<vector::TransferReadOp>( op.getLoc(), newReadType, op.getSource(), op.getIndices(), AffineMapAttr::get(newMap), op.getPadding(), op.getMask(), - newInBoundsAttr); + op.getInBoundsAttr()); rewriter.replaceOpWithNewOp<vector::BroadcastOp>(op, originalVecType, newRead); return success(); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp --- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp @@ -216,15 +216,10 @@ AffineMap::get(oldMap.getNumDims(), oldMap.getNumSymbols(), newResults, rewriter.getContext()); - ArrayAttr inBoundsAttr; - if (read.getInBounds()) - inBoundsAttr = rewriter.getArrayAttr( - read.getInBoundsAttr().getValue().take_back(newType.getRank())); - auto newRead = rewriter.create<vector::TransferReadOp>( read.getLoc(), newType, read.getSource(), read.getIndices(), AffineMapAttr::get(newMap), read.getPadding(), /*mask=*/Value(), - inBoundsAttr); + read.getInBoundsAttr()); rewriter.replaceOpWithNewOp<vector::BroadcastOp>(read, oldType, newRead); return success(); @@ -264,16 +259,11 @@ AffineMap::get(oldMap.getNumDims(), oldMap.getNumSymbols(), newResults, rewriter.getContext()); - ArrayAttr inBoundsAttr; - if (write.getInBounds()) - inBoundsAttr = rewriter.getArrayAttr( - write.getInBoundsAttr().getValue().take_back(newType.getRank())); - auto newVector = rewriter.create<vector::ExtractOp>( write.getLoc(), write.getVector(), splatZero(dropDim)); rewriter.replaceOpWithNewOp<vector::TransferWriteOp>( write, newVector, write.getSource(), write.getIndices(), - AffineMapAttr::get(newMap), inBoundsAttr); + AffineMapAttr::get(newMap), write.getInBoundsAttr()); return success(); } diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp @@ -523,7 +523,8 @@ vectorType.getElementType()); vector::TransferReadOp flatRead = rewriter.create<vector::TransferReadOp>( loc, flatVectorType, collapsedSource, collapsedIndices, collapsedMap); - flatRead.setInBoundsAttr(rewriter.getBoolArrayAttr({true})); + SmallVector<bool> newInBounds(collapsedIndices.size(), true); + flatRead.setInBoundsAttr(rewriter.getBoolArrayAttr(newInBounds)); rewriter.replaceOpWithNewOp<vector::ShapeCastOp>( transferReadOp, cast<VectorType>(vector.getType()), flatRead); return success(); @@ -586,7 +587,8 @@ vector::TransferWriteOp flatWrite = rewriter.create<vector::TransferWriteOp>( loc, flatVector, collapsedSource, collapsedIndices, collapsedMap); - flatWrite.setInBoundsAttr(rewriter.getBoolArrayAttr({true})); + SmallVector<bool> newInBounds(collapsedIndices.size(), true); + flatWrite.setInBoundsAttr(rewriter.getBoolArrayAttr(newInBounds)); rewriter.eraseOp(transferWriteOp); return success(); } @@ -744,6 +746,9 @@ // Map not supported. if (!xferOp.getPermutationMap().isMinorIdentity()) return failure(); + // Cannot rewrite out-of-bounds transfers. + if (xferOp.hasOutOfBoundsDim()) + return failure(); // Only float and integer element types are supported. 
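For the flattening patterns above, the new `in_bounds` is sized by the collapsed source rank rather than hardcoded to a single entry; a sketch with assumed shapes:

```mlir
%flat = memref.collapse_shape %m [[0, 1]] : memref<4x8xf32> into memref<32xf32>
%v = vector.transfer_read %flat[%c0], %pad {in_bounds = [true]}
    : memref<32xf32>, vector<32xf32>
```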
Value scalar; if (vecType.getRank() == 0) { diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp @@ -40,39 +40,48 @@ /// Build the condition to ensure that a particular VectorTransferOpInterface /// is in-bounds. -static Value createInBoundsCond(RewriterBase &b, - VectorTransferOpInterface xferOp) { +static Value createInBoundsCondition(RewriterBase &b, + VectorTransferOpInterface xferOp) { assert(xferOp.permutation_map().isMinorIdentity() && "Expected minor identity map"); + SmallVector<int64_t> accessedChunk = xferOp.getTransferChunkAccessed(); Value inBoundsCond; - xferOp.zipResultAndIndexing([&](int64_t resultIdx, int64_t indicesIdx) { - // Zip over the resulting vector shape and memref indices. + + for (int64_t dim = 0, e = xferOp.getShapedType().getRank(); dim < e; ++dim) { // If the dimension is known to be in-bounds, it does not participate in // the construction of `inBoundsCond`. - if (xferOp.isDimInBounds(resultIdx)) - return; + if (xferOp.isDimInBounds(dim)) + continue; + // Fold or create the check that `index + vector_size` <= `memref_size`. Location loc = xferOp.getLoc(); - int64_t vectorSize = xferOp.getVectorType().getDimSize(resultIdx); + // If the dimension is not part of the transfer shape, it's like a "1" + // dimension that is cast away. + int64_t vectorSize = accessedChunk[dim] == 0 ? 1 : accessedChunk[dim]; OpFoldResult sum = affine::makeComposedFoldedAffineApply( b, loc, b.getAffineDimExpr(0) + b.getAffineConstantExpr(vectorSize), - {xferOp.indices()[indicesIdx]}); - OpFoldResult dimSz = - memref::getMixedSize(b, loc, xferOp.source(), indicesIdx); + {xferOp.indices()[dim]}); + OpFoldResult dimSz = memref::getMixedSize(b, loc, xferOp.source(), dim); + + // Skip the check if it is statically "true". auto maybeCstSum = getConstantIntValue(sum); auto maybeCstDimSz = getConstantIntValue(dimSz); if (maybeCstSum && maybeCstDimSz && *maybeCstSum <= *maybeCstDimSz) - return; + continue; + + // Generate condition. Value cond = b.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sle, getValueOrCreateConstantIndexOp(b, loc, sum), getValueOrCreateConstantIndexOp(b, loc, dimSz)); + // Conjunction over all dims for which we are in-bounds. if (inBoundsCond) inBoundsCond = b.create<arith::AndIOp>(loc, inBoundsCond, cond); else inBoundsCond = cond; - }); + } + return inBoundsCond; } @@ -100,7 +109,7 @@ /// memref.cast %alloc: memref<B...> to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index // } -/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... true]} +/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true, true]} /// ``` /// where `alloc` is a top of the function alloca'ed buffer of one vector. /// @@ -481,7 +490,7 @@ /// memref.cast %alloc: memref<B...> to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index // } -/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... true]} +/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true, true]} /// ``` /// where `alloc` is a top of the function alloca'ed buffer of one vector. /// @@ -503,7 +512,7 @@ /// memref.cast %alloc: memref<B...> to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index /// } -/// %0 = vector.transfer_write %arg, %1#0[%1#1, %1#2] {in_bounds = [true ... 
+/// %0 = vector.transfer_write %arg, %1#0[%1#1, %1#2] {in_bounds = [true, /// true]} /// scf.if (%notInBounds) { /// // slowpath: not in-bounds vector.transfer or linalg.copy. @@ -522,7 +531,7 @@ if (options.vectorTransferSplit == VectorTransferSplit::None) return failure(); - SmallVector<bool> bools(xferOp.getTransferRank(), true); + SmallVector<bool> bools(xferOp.getShapedType().getRank(), true); auto inBoundsAttr = b.getBoolArrayAttr(bools); if (options.vectorTransferSplit == VectorTransferSplit::ForceInBounds) { b.updateRootInPlace(xferOp, [&]() { @@ -550,7 +559,7 @@ RewriterBase::InsertionGuard guard(b); b.setInsertionPoint(xferOp); - Value inBoundsCond = createInBoundsCond( + Value inBoundsCond = createInBoundsCondition( b, cast<VectorTransferOpInterface>(xferOp.getOperation())); if (!inBoundsCond) return failure(); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -992,6 +992,8 @@ bounds); } +/// Materialize a mask for a 1D vector transfer, where the transfer dimension +/// is out-of-bounds. template <typename ConcreteOp> struct MaterializeTransferMask : public OpRewritePattern<ConcreteOp> { public: @@ -1002,10 +1004,20 @@ LogicalResult matchAndRewrite(ConcreteOp xferOp, PatternRewriter &rewriter) const override { - if (!xferOp.hasOutOfBoundsDim()) + if (xferOp.getVectorType().getRank() > 1 || xferOp.getIndices().empty()) return failure(); - if (xferOp.getVectorType().getRank() > 1 || xferOp.getIndices().empty()) + // Get transfer dimension. + assert(xferOp.getPermutationMap().getNumResults() == 1 && + "expected one result in AffineMap"); + AffineDimExpr dimExpr = xferOp.getPermutationMap() + .getResult(0) + .template dyn_cast<AffineDimExpr>(); + assert(dimExpr && "expected dimension, not broadcast"); + int64_t transferDim = dimExpr.getPosition(); + + // The dimension must be out-of-bounds. + if (xferOp.isDimInBounds(transferDim)) return failure(); Location loc = xferOp->getLoc(); @@ -1031,9 +1043,17 @@ mask = rewriter.create<arith::AndIOp>(loc, mask, xferOp.getMask()); } + // The masked dimension is now in-bounds. 
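An IR-level sketch of the mask materialization under the new semantics (hypothetical shapes): only the single transfer dimension flips to in-bounds once the mask is attached; entries for the other source dims are preserved.

```mlir
// Before: d1 is the transfer dim and is out-of-bounds.
%v = vector.transfer_read %m[%i, %j], %pad {in_bounds = [true, false]}
    : memref<4x?xf32>, vector<8xf32>

// After: a mask guards d1, so it is marked in-bounds; d0 is unchanged.
%mask = vector.create_mask %rem : vector<8xi1>
%v2 = vector.transfer_read %m[%i, %j], %pad, %mask {in_bounds = [true, true]}
    : memref<4x?xf32>, vector<8xf32>
```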
+ SmallVector<bool> inBounds(xferOp.getShapedType().getRank(), false); + if (xferOp.getInBounds().has_value()) + inBounds = extractFromIntegerArrayAttr<bool>(xferOp.getInBoundsAttr()); + assert(!inBounds[transferDim] && + "expected that dimension was out-of-bounds"); + inBounds[transferDim] = true; + rewriter.updateRootInPlace(xferOp, [&]() { xferOp.getMaskMutable().assign(mask); - xferOp.setInBoundsAttr(rewriter.getBoolArrayAttr({true})); + xferOp.setInBoundsAttr(rewriter.getBoolArrayAttr(inBounds)); }); return success(); diff --git a/mlir/test/Conversion/GPUCommon/transfer_write.mlir b/mlir/test/Conversion/GPUCommon/transfer_write.mlir --- a/mlir/test/Conversion/GPUCommon/transfer_write.mlir +++ b/mlir/test/Conversion/GPUCommon/transfer_write.mlir @@ -7,7 +7,7 @@ // CHECK:%[[base:[0-9]+]] = llvm.extractvalue // CHECK:%[[ptr:[0-9]+]] = llvm.getelementptr %[[base]] // CHECK:llvm.store %[[val]], %[[ptr]] - vector.transfer_write %arg3, %arg1[%c0, %c0] {in_bounds = [true]} : vector<1xf32>, memref<1024x1024xf32> + vector.transfer_write %arg3, %arg1[%c0, %c0] {in_bounds = [true, true]} : vector<1xf32>, memref<1024x1024xf32> } return } diff --git a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir --- a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir +++ b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir @@ -276,17 +276,17 @@ // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]] // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK: [[fragmentA:%.+]] = nvgpu.ldmatrix %arg0[[[c0]], [[m_coord]], [[c0]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} - %A = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_a} : memref<4x32x1x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> + %A = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true], permutation_map = #map_a} : memref<4x32x1x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]] // CHECK-DAG: [[fragmentB:%.+]] = nvgpu.ldmatrix %arg1[[[c0]], [[c0]], [[k_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = true} - %B = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_b} : memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> + %B = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true], permutation_map = #map_b} : memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]] // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK-DAG: [[fragmentC:%.*]] = nvgpu.ldmatrix %arg2[[[c0]], [[m_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = false} - %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x32x40xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> + %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x32x40xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> // CHECK-DAG: [[fragmentB0:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16> // CHECK-DAG: [[fragmentC0:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : 
vector<4x2xf16> to vector<2x2xf16> @@ -294,7 +294,7 @@ %B0 = vector.extract_strided_slice %B {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> %C0 = vector.extract_strided_slice %C {offsets = [0, 0], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16> %D0 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B0, %C0 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16> - vector.transfer_write %D0, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<1x32x40xf16, #gpu.address_space<workgroup>> + vector.transfer_write %D0, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<16x8xf16>, memref<1x32x40xf16, #gpu.address_space<workgroup>> return } @@ -319,19 +319,19 @@ // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]] // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK: nvgpu.ldmatrix %arg0[[[C0]], [[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<4x2xf16> - %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> + %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16> // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]] // CHECK: nvgpu.ldmatrix %arg1[[[C0]], [[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<2x2xf16> - %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16> + %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16> // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]] // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]] // CHECK: nvgpu.ldmatrix %arg2[[[C0]], [[m_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = false} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<2x2xf16> - %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16> + %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16> %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16> - vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<2x20x20xf16, #gpu.address_space<workgroup>> + vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<16x8xf16>, memref<2x20x20xf16, #gpu.address_space<workgroup>> return } diff --git a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir --- a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir +++ b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir @@ -172,7 +172,7 @@ %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> %D = 
vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16> %E = vector.transfer_read %arg3[%c0, %c0, %c0, %c0], %cst - {in_bounds = [true, true], permutation_map = affine_map<(d0, d1, d2, d3)->(0, d3)>} + {in_bounds = [true, true, true, true], permutation_map = affine_map<(d0, d1, d2, d3)->(0, d3)>} : memref<16x16x16x16xf16>, vector<16x16xf16> %F = arith.divf %D, %E : vector<16x16xf16> vector.transfer_write %F, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16> @@ -199,11 +199,11 @@ %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf16> %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16x16xf16>, vector<16x16xf16> - %B = vector.transfer_read %arg1[%c0], %cst {permutation_map = #map4, in_bounds = [true, true]} : memref<16xf16>, vector<16x16xf16> - %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16x16xf16>, vector<16x16xf16> + %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x16x16xf16>, vector<16x16xf16> + %B = vector.transfer_read %arg1[%c0], %cst {permutation_map = #map4, in_bounds = [true]} : memref<16xf16>, vector<16x16xf16> + %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x16x16xf16>, vector<16x16xf16> %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16> - vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<2x16x16xf16> + vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<16x16xf16>, memref<2x16x16xf16> return } @@ -227,11 +227,11 @@ %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf16> %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16x16xf16, affine_map<(d0, d1, d2) -> (d0 * 512 + d1 * 32 + d2)>>, vector<16x16xf16> - %B = vector.transfer_read %arg1[%c0], %cst {permutation_map = #map4, in_bounds = [true, true]} : memref<16xf16>, vector<16x16xf16> - %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16x16xf16>, vector<16x16xf16> + %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x16x16xf16, affine_map<(d0, d1, d2) -> (d0 * 512 + d1 * 32 + d2)>>, vector<16x16xf16> + %B = vector.transfer_read %arg1[%c0], %cst {permutation_map = #map4, in_bounds = [true]} : memref<16xf16>, vector<16x16xf16> + %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<2x16x16xf16>, vector<16x16xf16> %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16> - vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<2x16x16xf16> + vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<16x16xf16>, memref<2x16x16xf16> return } @@ -281,8 +281,8 @@ %cst_0 = 
arith.constant dense<0.000000e+00> : vector<16x16xf16> %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %A = vector.transfer_read %arg0[%c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0) -> (d0, 0)>} : memref<16xf16>, vector<16x16xf16> - %B = vector.transfer_read %arg1[%c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0) -> (d0, 0)>} : memref<16xf16>, vector<16x16xf16> + %A = vector.transfer_read %arg0[%c0], %cst {in_bounds = [true], permutation_map = affine_map<(d0) -> (d0, 0)>} : memref<16xf16>, vector<16x16xf16> + %B = vector.transfer_read %arg1[%c0], %cst {in_bounds = [true], permutation_map = affine_map<(d0) -> (d0, 0)>} : memref<16xf16>, vector<16x16xf16> %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16> vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16> diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -1716,7 +1716,8 @@ func.func @transfer_read_2d_to_1d(%A : memref<?x?xf32>, %base0: index, %base1: index) -> vector<17xf32> { %f7 = arith.constant 7.0: f32 %f = vector.transfer_read %A[%base0, %base1], %f7 - {permutation_map = affine_map<(d0, d1) -> (d1)>} : + {permutation_map = affine_map<(d0, d1) -> (d1)>, + in_bounds = [true, false]} : memref<?x?xf32>, vector<17xf32> return %f: vector<17xf32> } diff --git a/mlir/test/Conversion/VectorToSCF/tensor-transfer-ops.mlir b/mlir/test/Conversion/VectorToSCF/tensor-transfer-ops.mlir --- a/mlir/test/Conversion/VectorToSCF/tensor-transfer-ops.mlir +++ b/mlir/test/Conversion/VectorToSCF/tensor-transfer-ops.mlir @@ -4,7 +4,7 @@ // CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<vector<4x9xf32>> // CHECK: %[[CASTED:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<4x9xf32>> to memref<4xvector<9xf32>> // CHECK: scf.for {{.*}} { -// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %cst {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32> +// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %cst {in_bounds = [true, true]} : tensor<?x?xf32>, vector<9xf32> // CHECK: memref.store %[[READ]], %[[CASTED]][%{{.*}}] : memref<4xvector<9xf32>> // CHECK: } // CHECK: %[[LOADED:.*]] = memref.load %[[ALLOC]][] : memref<vector<4x9xf32>> @@ -25,7 +25,7 @@ // CHECK: %[[CASTED:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<2x3xf32>> to memref<2xvector<3xf32>> // CHECK: %[[RESULT:.*]] = scf.for {{.*}} iter_args(%[[STATE:.*]] = %{{.*}}) -> (tensor<?x?xf32>) { // CHECK: %[[LOADED:.*]] = memref.load %[[CASTED]][%{{.*}}] : memref<2xvector<3xf32>> -// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[LOADED]], %[[STATE]][{{.*}}] {in_bounds = [true]} : vector<3xf32>, tensor<?x?xf32> +// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[LOADED]], %[[STATE]][{{.*}}] {in_bounds = [true, true]} : vector<3xf32>, tensor<?x?xf32> // CHECK: scf.yield %[[WRITE]] : tensor<?x?xf32> // CHECK: } // CHECK: return %[[RESULT]] : tensor<?x?xf32> diff --git a/mlir/test/Conversion/VectorToSCF/unrolled-tensor-transfer-ops.mlir b/mlir/test/Conversion/VectorToSCF/unrolled-tensor-transfer-ops.mlir --- a/mlir/test/Conversion/VectorToSCF/unrolled-tensor-transfer-ops.mlir +++ 
b/mlir/test/Conversion/VectorToSCF/unrolled-tensor-transfer-ops.mlir @@ -2,13 +2,13 @@ // CHECK-LABEL: func @transfer_read_2d( // CHECK: %[[V_INIT:.*]] = arith.constant dense<-4.200000e+01> : vector<4x9xf32> -// CHECK: %[[V0:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32> +// CHECK: %[[V0:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<9xf32> // CHECK: %[[I0:.*]] = vector.insert %[[V0]], %[[V_INIT]] [0] : vector<9xf32> into vector<4x9xf32> -// CHECK: %[[V1:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32> +// CHECK: %[[V1:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<9xf32> // CHECK: %[[I1:.*]] = vector.insert %[[V1]], %[[I0]] [1] : vector<9xf32> into vector<4x9xf32> -// CHECK: %[[V2:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32> +// CHECK: %[[V2:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<9xf32> // CHECK: %[[I2:.*]] = vector.insert %[[V2]], %[[I1]] [2] : vector<9xf32> into vector<4x9xf32> -// CHECK: %[[V3:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32> +// CHECK: %[[V3:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<9xf32> // CHECK: %[[I3:.*]] = vector.insert %[[V3]], %[[I2]] [3] : vector<9xf32> into vector<4x9xf32> // CHECK: return %[[I3]] : vector<4x9xf32> func.func @transfer_read_2d(%A : tensor<?x?xf32>, %base1 : index, %base2 : index) @@ -23,9 +23,9 @@ // CHECK-LABEL: func @transfer_write_2d( // CHECK: %[[V0:.*]] = vector.extract %{{.*}}[0] : vector<2x3xf32> -// CHECK: %[[T0:.*]] = vector.transfer_write %[[V0]], %{{.*}}[{{.*}}] {in_bounds = [true]} : vector<3xf32>, tensor<?x?xf32> +// CHECK: %[[T0:.*]] = vector.transfer_write %[[V0]], %{{.*}}[{{.*}}] {in_bounds = [true, true]} : vector<3xf32>, tensor<?x?xf32> // CHECK: %[[V1:.*]] = vector.extract %{{.*}}[1] : vector<2x3xf32> -// CHECK: %[[T1:.*]] = vector.transfer_write %[[V1]], %[[T0]][{{.*}}] {in_bounds = [true]} : vector<3xf32>, tensor<?x?xf32> +// CHECK: %[[T1:.*]] = vector.transfer_write %[[V1]], %[[T0]][{{.*}}] {in_bounds = [true, true]} : vector<3xf32>, tensor<?x?xf32> // CHECK: return %[[T1]] : tensor<?x?xf32> func.func @transfer_write_2d(%A : tensor<?x?xf32>, %vec : vector<2x3xf32>, %base1 : index, %base2 : index) -> (tensor<?x?xf32>) { diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-scf-mask-and-permutation-map.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-scf-mask-and-permutation-map.mlir --- a/mlir/test/Conversion/VectorToSCF/vector-to-scf-mask-and-permutation-map.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-scf-mask-and-permutation-map.mlir @@ -12,7 +12,7 @@ // CHECK: scf.for {{.*}} { // CHECK: scf.if {{.*}} { // CHECK: %[[MASK_LOADED:.*]] = memref.load %[[MASK_CASTED]][%{{.*}}] : memref<4xvector<9xi1>> -// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}, %{{.*}}, %[[MASK_LOADED]] : memref<?x?xf32>, vector<9xf32> +// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}, %{{.*}}, %[[MASK_LOADED]] {in_bounds = [true, false]} : memref<?x?xf32>, vector<9xf32> // CHECK: memref.store %[[READ]], %{{.*}} : memref<4xvector<9xf32>> // CHECK: } // CHECK: } diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir --- a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir @@ -26,13 +26,13 @@ 
%A = memref.alloc () : memref<7x42xf32> affine.for %i0 = 0 to 7 step 4 { affine.for %i1 = 0 to 42 step 4 { - %f1 = vector.transfer_read %A[%i0, %i1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> + %f1 = vector.transfer_read %A[%i0, %i1], %f0 {in_bounds = [false, true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> %ip1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i1) - %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> + %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {in_bounds = [false, true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> %ip2 = affine.apply affine_map<(d0) -> (d0 + 2)> (%i1) - %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> + %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {in_bounds = [false, true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> %ip3 = affine.apply affine_map<(d0) -> (d0 + 3)> (%i1) - %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> + %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {in_bounds = [false, true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> // Both accesses in the load must be clipped otherwise %i1 + 2 and %i1 + 3 will go out of bounds. // CHECK: scf.if // CHECK-NEXT: memref.load @@ -59,9 +59,9 @@ affine.for %i2 = 0 to %dyn2 { affine.for %i3 = 0 to 42 step 2 { affine.for %i4 = 0 to %dyn4 { - %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32> + %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {in_bounds = [true, true, true, false, true], permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32> %i3p1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i3) - %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32> + %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {in_bounds = [true, true, true, false, true], permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32> // Add a dummy use to prevent dead code elimination from removing // transfer read ops. "dummy_use"(%f1, %f2) : (vector<4xf32>, vector<4xf32>) -> () @@ -132,7 +132,7 @@ affine.for %i1 = 0 to %N { affine.for %i2 = 0 to %O { affine.for %i3 = 0 to %P step 5 { - %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref<?x?x?x?xf32>, vector<5x4x3xf32> + %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {in_bounds = [false, true, true, false], permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref<?x?x?x?xf32>, vector<5x4x3xf32> // Add a dummy use to prevent dead code elimination from removing // transfer read ops. 
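The pattern behind these test updates, spelled out on the first read above: reads that previously carried no attribute now list one entry per memref dim, `true` for dims absent from the permutation map and `false` for transfer dims that may run out-of-bounds.

```mlir
%f1 = vector.transfer_read %A[%i0, %i1], %f0
    {in_bounds = [false, true],
     permutation_map = affine_map<(d0, d1) -> (d0)>}
    : memref<7x42xf32>, vector<4xf32>
```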
"dummy_use"(%f) : (vector<5x4x3xf32>) -> () @@ -173,7 +173,7 @@ // CHECK: scf.for %[[I6:.*]] = %[[C0]] to %[[C1]] step %[[C1]] { // CHECK: %[[S0:.*]] = affine.apply #[[$ADD]](%[[I2]], %[[I6]]) // CHECK: %[[VEC:.*]] = memref.load %[[VECTOR_VIEW3]][%[[I4]], %[[I5]], %[[I6]]] : memref<3x4x1xvector<5xf32>> - // CHECK: vector.transfer_write %[[VEC]], %{{.*}}[%[[S3]], %[[S1]], %[[S0]], %[[I3]]] : vector<5xf32>, memref + // CHECK: vector.transfer_write %[[VEC]], %{{.*}}[%[[S3]], %[[S1]], %[[S0]], %[[I3]]] {in_bounds = [true, true, true, false]} : vector<5xf32>, memref // CHECK: } // CHECK: } // CHECK: } @@ -195,7 +195,7 @@ affine.for %i1 = 0 to %N step 4 { affine.for %i2 = 0 to %O { affine.for %i3 = 0 to %P step 5 { - vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0)>} : vector<5x4x3xf32>, memref + vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] {in_bounds = [false, false, true, false], permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0)>} : vector<5x4x3xf32>, memref } } } @@ -233,7 +233,7 @@ // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] // CHECK: %[[cond1:.*]] = arith.cmpi sgt, %[[dim]], %[[add]] : index // CHECK: scf.if %[[cond1]] { - // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> + // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] {in_bounds = [true, false]} : memref, vector<15xf32> // CHECK: memref.store %[[vec_1d]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>> // CHECK: } else { // CHECK: store %[[splat]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>> @@ -247,7 +247,7 @@ // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref // FULL-UNROLL: cmpi sgt, %[[DIM]], %[[base]] : index // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] {in_bounds = [true, false]} : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -256,7 +256,7 @@ // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]] // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] {in_bounds = [true, false]} : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -265,7 +265,7 @@ // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]] // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] {in_bounds = [true, false]} : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -306,7 +306,7 @@ // CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[dim]], 
%[[add]] : index // CHECK: scf.if %[[cmp]] { // CHECK: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>> - // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] : vector<15xf32>, memref + // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] {in_bounds = [true, false]} : vector<15xf32>, memref // CHECK: } // CHECK: } @@ -315,19 +315,19 @@ // FULL-UNROLL: %[[CMP0:.*]] = arith.cmpi sgt, %[[DIM]], %[[base]] : index // FULL-UNROLL: scf.if %[[CMP0]] { // FULL-UNROLL: %[[V0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true, false]} : vector<15xf32>, memref // FULL-UNROLL: } // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]] // FULL-UNROLL: %[[CMP1:.*]] = arith.cmpi sgt, %{{.*}}, %[[I1]] : index // FULL-UNROLL: scf.if %[[CMP1]] { // FULL-UNROLL: %[[V1:.*]] = vector.extract %[[vec]][1] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] {in_bounds = [true, false]} : vector<15xf32>, memref // FULL-UNROLL: } // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]] // FULL-UNROLL: %[[CMP2:.*]] = arith.cmpi sgt, %{{.*}}, %[[I2]] : index // FULL-UNROLL: scf.if %[[CMP2]] { // FULL-UNROLL: %[[V2:.*]] = vector.extract %[[vec]][2] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] {in_bounds = [true, false]} : vector<15xf32>, memref // FULL-UNROLL: } vector.transfer_write %vec, %A[%base, %base] : @@ -360,16 +360,16 @@ // CHECK-NEXT: scf.for %[[I:.*]] = %[[C0]] to %[[C3]] // CHECK-NEXT: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] // CHECK-NEXT: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>> - // CHECK-NEXT: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref + // CHECK-NEXT: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true, true]} : vector<15xf32>, memref // FULL-UNROLL: %[[VEC0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[VEC0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[VEC0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true, true]} : vector<15xf32>, memref // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]] // FULL-UNROLL: %[[VEC1:.*]] = vector.extract %[[vec]][1] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %2, %[[A]][%[[I1]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %2, %[[A]][%[[I1]], %[[base]]] {in_bounds = [true, true]} : vector<15xf32>, memref // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]] // FULL-UNROLL: %[[VEC2:.*]] = vector.extract %[[vec]][2] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[VEC2:.*]], %[[A]][%[[I2]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[VEC2:.*]], %[[A]][%[[I2]], %[[base]]] {in_bounds = [true, true]} : vector<15xf32>, memref vector.transfer_write %vec, %A[%base, %base] {in_bounds = 
[true, true]} : vector<3x15xf32>, memref return @@ -396,7 +396,8 @@ %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 %0 = vector.transfer_read %A[%c0, %c0, %c0, %c0], %f0 - { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> } + { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>, + in_bounds = [true, true, false, false] } : memref, vector<3x3xf32> return %0 : vector<3x3xf32> } @@ -415,7 +416,7 @@ // CHECK: %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref // CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg1]] : index // CHECK: scf.if %[[cmp]] { -// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref, vector<3xf32> +// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] {in_bounds = [true, true, true, false]} : memref, vector<3xf32> // CHECK: memref.store %[[tr]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>> // CHECK: } else { // CHECK: memref.store %[[cst0]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>> @@ -428,7 +429,8 @@ %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 vector.transfer_write %A, %B[%c0, %c0, %c0, %c0] - { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> } + { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>, + in_bounds = [true, true, false, false] } : vector<3x3xf32>, memref return } @@ -448,7 +450,7 @@ // CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg2]] : index // CHECK: scf.if %[[cmp]] { // CHECK: %[[tmp:.*]] = memref.load %[[cast]][%[[arg2]]] : memref<3xvector<3xf32>> -// CHECK: vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref +// CHECK: vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] {in_bounds = [true, true, true, false]} : vector<3xf32>, memref // CHECK: } // CHECK: } // CHECK: return @@ -459,7 +461,7 @@ func.func @transfer_read_strided(%A : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) -> vector<4xf32> { %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 - %0 = vector.transfer_read %A[%c0, %c0], %f0 + %0 = vector.transfer_read %A[%c0, %c0], %f0 {in_bounds = [true, false]} : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>, vector<4xf32> return %0 : vector<4xf32> } @@ -470,7 +472,7 @@ func.func @transfer_write_strided(%A : vector<4xf32>, %B : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) { %c0 = arith.constant 0 : index - vector.transfer_write %A, %B[%c0, %c0] : + vector.transfer_write %A, %B[%c0, %c0] {in_bounds = [true, false]} : vector<4xf32>, memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>> return } diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir @@ -56,7 +56,7 @@ // VECNEST: vector.transfer_read // VECNEST-NEXT: affine.for %{{.*}} = 0 to 30 { // VECNEST: vector.transfer_read -// VECNEST-NEXT: vector.transfer_write %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] {permutation_map = #{{.*}}} +// VECNEST-NEXT: vector.transfer_write %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] {in_bounds = [false, true], permutation_map = #{{.*}}} // VECNEST-NEXT: } // VECNEST-NEXT: vector.transfer_write // VECNEST: } diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir +++ 
b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir @@ -22,7 +22,7 @@ // CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]]) // CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]]) // CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true], permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector %a0 = affine.load %A[%c0, %c0] : memref } @@ -48,7 +48,7 @@ // CHECK:for [[IV3:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128 // CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %[[CST]] : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %[[CST]] {in_bounds = [true, false]} : memref, vector<128xf32> affine.for %i3 = 0 to %M { // vectorized %a3 = affine.load %A[%c0, %i3] : memref } @@ -77,7 +77,7 @@ // CHECK-NEXT: %[[APP9_0:[0-9a-zA-Z_]+]] = affine.apply {{.*}}([[IV9]], [[IV8]]) // CHECK-NEXT: %[[APP9_1:[0-9a-zA-Z_]+]] = affine.apply {{.*}}([[IV9]], [[IV8]]) // CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %[[CST]] : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %[[CST]] {in_bounds = [true, false]} : memref, vector<128xf32> affine.for %i8 = 0 to %M { // vectorized affine.for %i9 = 0 to %N { %a9 = affine.load %A[%i9, %i8 + %i9] : memref @@ -98,7 +98,7 @@ affine.for %i0 = 0 to %M { affine.for %i1 = 0 to %N { // CHECK: %[[C1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32> - // CHECK: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref + // CHECK: vector.transfer_write %[[C1]], {{.*}} {in_bounds = [true, false]} : vector<128xf32>, memref // non-scoped %f1 affine.store %f1, %A[%i0, %i1] : memref } @@ -106,7 +106,7 @@ affine.for %i2 = 0 to %M { affine.for %i3 = 0 to %N { // CHECK: %[[C3:.*]] = arith.constant dense<2.000000e+00> : vector<128xf32> - // CHECK: vector.transfer_write %[[C3]], {{.*}} : vector<128xf32>, memref + // CHECK: vector.transfer_write %[[C3]], {{.*}} {in_bounds = [true, false]} : vector<128xf32>, memref // non-scoped %f2 affine.store %f2, %B[%i2, %i3] : memref } @@ -115,13 +115,13 @@ affine.for %i5 = 0 to %N { // CHECK: %[[SPLAT2:.*]] = arith.constant dense<2.000000e+00> : vector<128xf32> // CHECK: %[[SPLAT1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32> - // CHECK: %[[A5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref, vector<128xf32> - // CHECK: %[[B5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref, vector<128xf32> + // CHECK: %[[A5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} {in_bounds = [true, false]} : memref, vector<128xf32> + // CHECK: %[[B5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} {in_bounds = [true, false]} : memref, vector<128xf32> // CHECK: %[[S5:.*]] = arith.addf %[[A5]], %[[B5]] : vector<128xf32> // CHECK: %[[S6:.*]] = arith.addf %[[S5]], %[[SPLAT1]] : vector<128xf32> // CHECK: %[[S7:.*]] = arith.addf %[[S5]], %[[SPLAT2]] : vector<128xf32> // CHECK: %[[S8:.*]] = arith.addf %[[S7]], %[[S6]] : vector<128xf32> - // CHECK: 
vector.transfer_write %[[S8]], {{.*}} : vector<128xf32>, memref + // CHECK: vector.transfer_write %[[S8]], {{.*}} {in_bounds = [true, false]} : vector<128xf32>, memref %a5 = affine.load %A[%i4, %i5] : memref %b5 = affine.load %B[%i4, %i5] : memref %s5 = arith.addf %a5, %b5 : f32 @@ -150,7 +150,7 @@ affine.for %i0 = 0 to %M { // vectorized // CHECK: %[[C1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32> // CHECK-NEXT: affine.for - // CHECK-NEXT: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref + // CHECK-NEXT: vector.transfer_write %[[C1]], {{.*}} {in_bounds = [true, false]} : vector<128xf32>, memref affine.for %i1 = 0 to %N { affine.store %f1, %A[%i1, %i0] : memref } @@ -171,7 +171,7 @@ // CHECK-NEXT: affine.for %[[IV1:[0-9a-zA-Z_]+]] = 0 to 32 { // CHECK-NEXT: %[[BROADCAST:.*]] = vector.broadcast %[[IV1]] : index to vector<128xindex> // CHECK-NEXT: %[[CAST:.*]] = arith.index_cast %[[BROADCAST]] : vector<128xindex> to vector<128xi32> - // CHECK-NEXT: vector.transfer_write %[[CAST]], {{.*}}[%[[IV1]], %[[IV0]]] : vector<128xi32>, memref<32x512xi32> + // CHECK-NEXT: vector.transfer_write %[[CAST]], {{.*}}[%[[IV1]], %[[IV0]]] {in_bounds = [true, false]} : vector<128xi32>, memref<32x512xi32> affine.for %i = 0 to 512 { // vectorized affine.for %j = 0 to 32 { %idx = arith.index_cast %j : index to i32 @@ -281,7 +281,7 @@ // CHECK:for [[IV4:%[0-9a-zA-Z_]+]] = 0 to [[ARG_M]] step 128 { // CHECK-NEXT: for [[IV5:%[0-9a-zA-Z_]*]] = 0 to [[ARG_N]] { // CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{[a-zA-Z0-9_]*}} : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{[a-zA-Z0-9_]*}} {in_bounds = [true, false]} : memref, vector<128xf32> affine.for %i4 = 0 to %M { // vectorized affine.for %i5 = 0 to %N { // not vectorized, would vectorize with --test-fastest-varying=1 %a5 = affine.load %A[%i5, %i4] : memref @@ -425,7 +425,7 @@ // CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> +// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true], permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}} affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector %a18 = affine.load %A[%c0, %c0] : memref @@ -459,7 +459,7 @@ // CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true], permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}} affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector %a18 = affine.load %A[%c0, %c0] : memref diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir 
b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir @@ -123,8 +123,8 @@ // VECT: affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 { // VECT-NEXT: affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 { // VECT-NEXT: affine.for %[[I4:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[K]]) { - // VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref, vector<4x8xf32> - // VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref, vector<4x8xf32> + // VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {in_bounds = [true, false], permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref, vector<4x8xf32> + // VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {in_bounds = [false, true], permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref, vector<4x8xf32> // VECT-NEXT: %[[C:.*]] = arith.mulf %[[B]], %[[A]] : vector<4x8xf32> // VECT: %[[D:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I3]]], %{{.*}} : memref, vector<4x8xf32> // VECT-NEXT: %[[E:.*]] = arith.addf %[[D]], %[[C]] : vector<4x8xf32> diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir @@ -13,7 +13,7 @@ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 - // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d0d2]]} : memref, vector<32x256xf32> + // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [false, true, false], permutation_map = #[[map_proj_d0d1d2_d0d2]]} : memref, vector<32x256xf32> affine.for %i0 = 0 to %M { affine.for %i1 = 0 to %N { affine.for %i2 = 0 to %P { diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir @@ -25,7 +25,7 @@ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { - // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> + // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [false, true, false], permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> affine.for %i3 = 0 to %M { affine.for %i4 = 0 to %N { affine.for %i5 = 0 to %P { @@ -46,12 +46,12 @@ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : 
memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [false, true, false], permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [false, true, false], permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [false, true, false], permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref, vector<32x256xf32> affine.for %i0 = 0 to %0 { affine.for %i1 = 0 to %1 { affine.for %i2 = 0 to %2 { diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir @@ -25,7 +25,7 @@ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 - // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> + // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, false, false], permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> affine.for %i3 = 0 to %M { affine.for %i4 = 0 to %N { affine.for %i5 = 0 to %P { @@ -46,12 +46,12 @@ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, false, false], permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> // CHECK: affine.for %{{.*}} = 0 to %{{.*}} { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, false, false], permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, false, false], permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref, vector<32x256xf32> affine.for %i0 = 0 to %0 { affine.for %i1 = 0 to %1 { affine.for %i2 = 0 to 
%2 { diff --git a/mlir/test/Dialect/Linalg/hoisting.mlir b/mlir/test/Dialect/Linalg/hoisting.mlir --- a/mlir/test/Dialect/Linalg/hoisting.mlir +++ b/mlir/test/Dialect/Linalg/hoisting.mlir @@ -46,13 +46,13 @@ // CHECK: "unrelated_use"(%[[MEMREF1]]) : (memref) -> () scf.for %i = %lb to %ub step %step { scf.for %j = %lb to %ub step %step { - %r0 = vector.transfer_read %memref1[%c0, %c0], %cst: memref, vector<1xf32> - %r1 = vector.transfer_read %memref0[%i, %i], %cst: memref, vector<2xf32> - %r2 = vector.transfer_read %memref2[%c0, %c0], %cst: memref, vector<3xf32> - %r3 = vector.transfer_read %memref3[%c0, %c0], %cst: memref, vector<4xf32> + %r0 = vector.transfer_read %memref1[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<1xf32> + %r1 = vector.transfer_read %memref0[%i, %i], %cst {in_bounds = [true, false]} : memref, vector<2xf32> + %r2 = vector.transfer_read %memref2[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<3xf32> + %r3 = vector.transfer_read %memref3[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<4xf32> "some_crippling_use"(%memref4) : (memref) -> () - %r4 = vector.transfer_read %memref4[%c0, %c0], %cst: memref, vector<5xf32> - %r5 = vector.transfer_read %memref5[%c0, %c0], %cst: memref, vector<6xf32> + %r4 = vector.transfer_read %memref4[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<5xf32> + %r5 = vector.transfer_read %memref5[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<6xf32> "some_crippling_use"(%memref5) : (memref) -> () %u0 = "some_use"(%r0) : (vector<1xf32>) -> vector<1xf32> %u1 = "some_use"(%r1) : (vector<2xf32>) -> vector<2xf32> @@ -60,12 +60,12 @@ %u3 = "some_use"(%r3) : (vector<4xf32>) -> vector<4xf32> %u4 = "some_use"(%r4) : (vector<5xf32>) -> vector<5xf32> %u5 = "some_use"(%r5) : (vector<6xf32>) -> vector<6xf32> - vector.transfer_write %u0, %memref1[%c0, %c0] : vector<1xf32>, memref - vector.transfer_write %u1, %memref0[%i, %i] : vector<2xf32>, memref - vector.transfer_write %u2, %memref2[%c0, %c0] : vector<3xf32>, memref - vector.transfer_write %u3, %memref3[%c0, %c0] : vector<4xf32>, memref - vector.transfer_write %u4, %memref4[%c0, %c0] : vector<5xf32>, memref - vector.transfer_write %u5, %memref5[%c0, %c0] : vector<6xf32>, memref + vector.transfer_write %u0, %memref1[%c0, %c0] {in_bounds = [true, false]} : vector<1xf32>, memref + vector.transfer_write %u1, %memref0[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, memref + vector.transfer_write %u2, %memref2[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, memref + vector.transfer_write %u3, %memref3[%c0, %c0] {in_bounds = [true, false]} : vector<4xf32>, memref + vector.transfer_write %u4, %memref4[%c0, %c0] {in_bounds = [true, false]} : vector<5xf32>, memref + vector.transfer_write %u5, %memref5[%c0, %c0] {in_bounds = [true, false]} : vector<6xf32>, memref "some_crippling_use"(%memref3) : (memref) -> () } "unrelated_use"(%memref0) : (memref) -> () @@ -134,14 +134,14 @@ // CHECK: vector.transfer_write %{{.*}}, %[[MEMREF2]]{{.*}} : vector<3xf32>, memref scf.for %i = %lb to %ub step %step { scf.for %j = %lb to %ub step %step { - %r00 = vector.transfer_read %memref1[%c0, %c0], %cst: memref, vector<2xf32> - %r01 = vector.transfer_read %memref1[%c0, %c1], %cst: memref, vector<2xf32> - %r20 = vector.transfer_read %memref2[%c0, %c0], %cst: memref, vector<3xf32> - %r21 = vector.transfer_read %memref2[%c0, %c3], %cst: memref, vector<3xf32> - %r30 = vector.transfer_read %memref3[%c0, %random_index], %cst: memref, vector<4xf32> - %r31 = 
vector.transfer_read %memref3[%c1, %random_index], %cst: memref, vector<4xf32> - %r10 = vector.transfer_read %memref0[%i, %i], %cst: memref, vector<2xf32> - %r11 = vector.transfer_read %memref0[%random_index, %random_index], %cst: memref, vector<2xf32> + %r00 = vector.transfer_read %memref1[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<2xf32> + %r01 = vector.transfer_read %memref1[%c0, %c1], %cst {in_bounds = [true, false]} : memref, vector<2xf32> + %r20 = vector.transfer_read %memref2[%c0, %c0], %cst {in_bounds = [true, false]} : memref, vector<3xf32> + %r21 = vector.transfer_read %memref2[%c0, %c3], %cst {in_bounds = [true, false]} : memref, vector<3xf32> + %r30 = vector.transfer_read %memref3[%c0, %random_index], %cst {in_bounds = [true, false]} : memref, vector<4xf32> + %r31 = vector.transfer_read %memref3[%c1, %random_index], %cst {in_bounds = [true, false]} : memref, vector<4xf32> + %r10 = vector.transfer_read %memref0[%i, %i], %cst {in_bounds = [true, false]} : memref, vector<2xf32> + %r11 = vector.transfer_read %memref0[%random_index, %random_index], %cst {in_bounds = [true, false]} : memref, vector<2xf32> %u00 = "some_use"(%r00) : (vector<2xf32>) -> vector<2xf32> %u01 = "some_use"(%r01) : (vector<2xf32>) -> vector<2xf32> %u20 = "some_use"(%r20) : (vector<3xf32>) -> vector<3xf32> @@ -150,14 +150,14 @@ %u31 = "some_use"(%r31) : (vector<4xf32>) -> vector<4xf32> %u10 = "some_use"(%r10) : (vector<2xf32>) -> vector<2xf32> %u11 = "some_use"(%r11) : (vector<2xf32>) -> vector<2xf32> - vector.transfer_write %u00, %memref1[%c0, %c0] : vector<2xf32>, memref - vector.transfer_write %u01, %memref1[%c0, %c1] : vector<2xf32>, memref - vector.transfer_write %u20, %memref2[%c0, %c0] : vector<3xf32>, memref - vector.transfer_write %u21, %memref2[%c0, %c3] : vector<3xf32>, memref - vector.transfer_write %u30, %memref3[%c0, %random_index] : vector<4xf32>, memref - vector.transfer_write %u31, %memref3[%c1, %random_index] : vector<4xf32>, memref - vector.transfer_write %u10, %memref0[%i, %i] : vector<2xf32>, memref - vector.transfer_write %u11, %memref0[%random_index, %random_index] : vector<2xf32>, memref + vector.transfer_write %u00, %memref1[%c0, %c0] {in_bounds = [true, false]} : vector<2xf32>, memref + vector.transfer_write %u01, %memref1[%c0, %c1] {in_bounds = [true, false]} : vector<2xf32>, memref + vector.transfer_write %u20, %memref2[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, memref + vector.transfer_write %u21, %memref2[%c0, %c3] {in_bounds = [true, false]} : vector<3xf32>, memref + vector.transfer_write %u30, %memref3[%c0, %random_index] {in_bounds = [true, false]} : vector<4xf32>, memref + vector.transfer_write %u31, %memref3[%c1, %random_index] {in_bounds = [true, false]} : vector<4xf32>, memref + vector.transfer_write %u10, %memref0[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, memref + vector.transfer_write %u11, %memref0[%random_index, %random_index] {in_bounds = [true, false]} : vector<2xf32>, memref } } return @@ -180,7 +180,7 @@ // CHECK: %[[C0:.*]] = arith.constant 0 : i32 // CHECK: affine.for %[[I:.*]] = 0 to 64 { // CHECK: affine.for %[[J:.*]] = 0 to 64 step 16 { -// CHECK: %[[R0:.*]] = vector.transfer_read %[[MEMREF2]][%[[I]], %[[J]]], %[[C0]] : memref<64x64xi32>, vector<16xi32> +// CHECK: %[[R0:.*]] = vector.transfer_read %[[MEMREF2]][%[[I]], %[[J]]], %[[C0]] {{.*}} : memref<64x64xi32>, vector<16xi32> // CHECK: %[[R:.*]] = affine.for %[[K:.*]] = 0 to 64 iter_args(%[[ACC:.*]] = %[[R0]]) -> (vector<16xi32>) { // CHECK: %[[AV:.*]] = 
vector.transfer_read %[[MEMREF0]][%[[I]], %[[K]]], %[[C0]] {{.*}}: memref<64x64xi32>, vector<16xi32> // CHECK: %[[BV:.*]] = vector.transfer_read %[[MEMREF1]][%[[K]], %[[J]]], %[[C0]] {{.*}}: memref<64x64xi32>, vector<16xi32> @@ -188,7 +188,7 @@ // CHECK: %[[T1:.*]] = arith.addi %[[ACC]], %[[T0]] : vector<16xi32> // CHECK: affine.yield %[[T1]] : vector<16xi32> // CHECK: } -// CHECK: vector.transfer_write %[[R]], %[[MEMREF2]][%[[I]], %[[J]]] : vector<16xi32>, memref<64x64xi32> +// CHECK: vector.transfer_write %[[R]], %[[MEMREF2]][%[[I]], %[[J]]] {{.*}} : vector<16xi32>, memref<64x64xi32> // CHECK: } // CHECK: } func.func @hoist_vector_transfer_pairs_in_affine_loops(%memref0: memref<64x64xi32>, %memref1: memref<64x64xi32>, %memref2: memref<64x64xi32>) { @@ -196,12 +196,12 @@ affine.for %arg3 = 0 to 64 { affine.for %arg4 = 0 to 64 step 16 { affine.for %arg5 = 0 to 64 { - %0 = vector.transfer_read %memref0[%arg3, %arg5], %c0_i32 {permutation_map = affine_map<(d0, d1) -> (0)>} : memref<64x64xi32>, vector<16xi32> - %1 = vector.transfer_read %memref1[%arg5, %arg4], %c0_i32 : memref<64x64xi32>, vector<16xi32> - %2 = vector.transfer_read %memref2[%arg3, %arg4], %c0_i32 : memref<64x64xi32>, vector<16xi32> + %0 = vector.transfer_read %memref0[%arg3, %arg5], %c0_i32 {permutation_map = affine_map<(d0, d1) -> (0)>, in_bounds = [true, true]} : memref<64x64xi32>, vector<16xi32> + %1 = vector.transfer_read %memref1[%arg5, %arg4], %c0_i32 {in_bounds = [true, false]} : memref<64x64xi32>, vector<16xi32> + %2 = vector.transfer_read %memref2[%arg3, %arg4], %c0_i32 {in_bounds = [true, false]} : memref<64x64xi32>, vector<16xi32> %3 = arith.muli %0, %1 : vector<16xi32> %4 = arith.addi %2, %3 : vector<16xi32> - vector.transfer_write %4, %memref2[%arg3, %arg4] : vector<16xi32>, memref<64x64xi32> + vector.transfer_write %4, %memref2[%arg3, %arg4] {in_bounds = [true, false]} : vector<16xi32>, memref<64x64xi32> } } } @@ -264,12 +264,12 @@ %arg9 = %arg3, %arg10 = %arg4, %arg11 = %arg5) -> (tensor, tensor, tensor, tensor, tensor, tensor) { - %r0 = vector.transfer_read %arg7[%c0, %c0], %cst: tensor, vector<1xf32> - %r1 = vector.transfer_read %arg6[%i, %i], %cst: tensor, vector<2xf32> - %r3 = vector.transfer_read %arg9[%c0, %c0], %cst: tensor, vector<4xf32> + %r0 = vector.transfer_read %arg7[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<1xf32> + %r1 = vector.transfer_read %arg6[%i, %i], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> + %r3 = vector.transfer_read %arg9[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<4xf32> "some_crippling_use"(%arg10) : (tensor) -> () - %r4 = vector.transfer_read %arg10[%c0, %c0], %cst: tensor, vector<5xf32> - %r5 = vector.transfer_read %arg11[%c0, %c0], %cst: tensor, vector<6xf32> + %r4 = vector.transfer_read %arg10[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<5xf32> + %r5 = vector.transfer_read %arg11[%c0, %c0], %cst{in_bounds = [true, false]} : tensor, vector<6xf32> "some_crippling_use"(%arg11) : (tensor) -> () %u0 = "some_use"(%r0) : (vector<1xf32>) -> vector<1xf32> %u1 = "some_use"(%r1) : (vector<2xf32>) -> vector<2xf32> @@ -277,12 +277,12 @@ %u3 = "some_use"(%r3) : (vector<4xf32>) -> vector<4xf32> %u4 = "some_use"(%r4) : (vector<5xf32>) -> vector<5xf32> %u5 = "some_use"(%r5) : (vector<6xf32>) -> vector<6xf32> - %w1 = vector.transfer_write %u0, %arg7[%c0, %c0] : vector<1xf32>, tensor - %w0 = vector.transfer_write %u1, %arg6[%i, %i] : vector<2xf32>, tensor - %w2 = vector.transfer_write %u2, %arg8[%c0, %c0] : vector<3xf32>, tensor - %w3 
= vector.transfer_write %u3, %arg9[%c0, %c0] : vector<4xf32>, tensor - %w4 = vector.transfer_write %u4, %arg10[%c0, %c0] : vector<5xf32>, tensor - %w5 = vector.transfer_write %u5, %arg11[%c0, %c0] : vector<6xf32>, tensor + %w1 = vector.transfer_write %u0, %arg7[%c0, %c0] {in_bounds = [true, false]} : vector<1xf32>, tensor + %w0 = vector.transfer_write %u1, %arg6[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, tensor + %w2 = vector.transfer_write %u2, %arg8[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, tensor + %w3 = vector.transfer_write %u3, %arg9[%c0, %c0] {in_bounds = [true, false]} : vector<4xf32>, tensor + %w4 = vector.transfer_write %u4, %arg10[%c0, %c0] {in_bounds = [true, false]} : vector<5xf32>, tensor + %w5 = vector.transfer_write %u5, %arg11[%c0, %c0] {in_bounds = [true, false]} : vector<6xf32>, tensor "some_crippling_use"(%w3) : (tensor) -> () scf.yield %w0, %w1, %w2, %w3, %w4, %w5 : tensor, tensor, tensor, tensor, @@ -361,14 +361,14 @@ iter_args(%arg4 = %arg0, %arg5 = %arg1, %arg6 = %arg2, %arg7 = %arg3) -> (tensor, tensor, tensor, tensor) { - %r00 = vector.transfer_read %arg5[%c0, %c0], %cst: tensor, vector<2xf32> - %r01 = vector.transfer_read %arg5[%c0, %c1], %cst: tensor, vector<2xf32> - %r20 = vector.transfer_read %arg6[%c0, %c0], %cst: tensor, vector<3xf32> - %r21 = vector.transfer_read %arg6[%c0, %c3], %cst: tensor, vector<3xf32> - %r30 = vector.transfer_read %arg7[%c0, %random_index], %cst: tensor, vector<4xf32> - %r31 = vector.transfer_read %arg7[%c1, %random_index], %cst: tensor, vector<4xf32> - %r10 = vector.transfer_read %arg4[%i, %i], %cst: tensor, vector<2xf32> - %r11 = vector.transfer_read %arg4[%random_index, %random_index], %cst: tensor, vector<2xf32> + %r00 = vector.transfer_read %arg5[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> + %r01 = vector.transfer_read %arg5[%c0, %c1], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> + %r20 = vector.transfer_read %arg6[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<3xf32> + %r21 = vector.transfer_read %arg6[%c0, %c3], %cst {in_bounds = [true, false]} : tensor, vector<3xf32> + %r30 = vector.transfer_read %arg7[%c0, %random_index], %cst {in_bounds = [true, false]} : tensor, vector<4xf32> + %r31 = vector.transfer_read %arg7[%c1, %random_index], %cst {in_bounds = [true, false]} : tensor, vector<4xf32> + %r10 = vector.transfer_read %arg4[%i, %i], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> + %r11 = vector.transfer_read %arg4[%random_index, %random_index], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> %u00 = "some_use"(%r00) : (vector<2xf32>) -> vector<2xf32> %u01 = "some_use"(%r01) : (vector<2xf32>) -> vector<2xf32> %u20 = "some_use"(%r20) : (vector<3xf32>) -> vector<3xf32> @@ -377,14 +377,14 @@ %u31 = "some_use"(%r31) : (vector<4xf32>) -> vector<4xf32> %u10 = "some_use"(%r10) : (vector<2xf32>) -> vector<2xf32> %u11 = "some_use"(%r11) : (vector<2xf32>) -> vector<2xf32> - %w10 = vector.transfer_write %u00, %arg5[%c0, %c0] : vector<2xf32>, tensor - %w11 = vector.transfer_write %u01, %w10[%c0, %c1] : vector<2xf32>, tensor - %w20 = vector.transfer_write %u20, %arg6[%c0, %c0] : vector<3xf32>, tensor - %w21 = vector.transfer_write %u21, %w20[%c0, %c3] : vector<3xf32>, tensor - %w30 = vector.transfer_write %u30, %arg7[%c0, %random_index] : vector<4xf32>, tensor - %w31 = vector.transfer_write %u31, %w30[%c1, %random_index] : vector<4xf32>, tensor - %w00 = vector.transfer_write %u10, %arg4[%i, %i] : vector<2xf32>, tensor - %w01 = vector.transfer_write 
%u11, %w00[%random_index, %random_index] : vector<2xf32>, tensor + %w10 = vector.transfer_write %u00, %arg5[%c0, %c0] {in_bounds = [true, false]} : vector<2xf32>, tensor + %w11 = vector.transfer_write %u01, %w10[%c0, %c1] {in_bounds = [true, false]} : vector<2xf32>, tensor + %w20 = vector.transfer_write %u20, %arg6[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, tensor + %w21 = vector.transfer_write %u21, %w20[%c0, %c3] {in_bounds = [true, false]} : vector<3xf32>, tensor + %w30 = vector.transfer_write %u30, %arg7[%c0, %random_index] {in_bounds = [true, false]} : vector<4xf32>, tensor + %w31 = vector.transfer_write %u31, %w30[%c1, %random_index] {in_bounds = [true, false]} : vector<4xf32>, tensor + %w00 = vector.transfer_write %u10, %arg4[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, tensor + %w01 = vector.transfer_write %u11, %w00[%random_index, %random_index] {in_bounds = [true, false]} : vector<2xf32>, tensor scf.yield %w01, %w11, %w21, %w31 : tensor, tensor, tensor, tensor } scf.yield %1#0, %1#1, %1#2, %1#3 : tensor, tensor, tensor, tensor @@ -444,19 +444,19 @@ -> (tensor, tensor, tensor) { // Hoists. %st0 = tensor.extract_slice %arg6[%i, %i][%step, %step][1, 1] : tensor to tensor - %r0 = vector.transfer_read %st0[%c0, %c0], %cst: tensor, vector<1xf32> + %r0 = vector.transfer_read %st0[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<1xf32> // CHECK: %[[ST1:.*]] = tensor.extract_slice %[[TENSOR1_ARG_L2]][%[[J]],{{.*}}: tensor to tensor // CHECK: %[[V1:.*]] = vector.transfer_read %[[ST1]]{{.*}} : tensor, vector<2xf32> // Does not hoist (slice depends on %j) %st1 = tensor.extract_slice %arg7[%j, %c0][%step, %step][1, 1] : tensor to tensor - %r1 = vector.transfer_read %st1[%c0, %c0], %cst: tensor, vector<2xf32> + %r1 = vector.transfer_read %st1[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> // CHECK: %[[ST2:.*]] = tensor.extract_slice %[[TENSOR2_ARG_L2]][%[[I]],{{.*}}: tensor to tensor // CHECK: %[[V2:.*]] = vector.transfer_read %[[ST2]]{{.*}} : tensor, vector<3xf32> // Does not hoist, 2 slice %arg8. %st2 = tensor.extract_slice %arg8[%i, %c0][%step, %step][1, 1] : tensor to tensor - %r2 = vector.transfer_read %st2[%c0, %c0], %cst: tensor, vector<3xf32> + %r2 = vector.transfer_read %st2[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<3xf32> // CHECK: %[[U0:.*]] = "some_use"(%[[V0_ARG_L2]]) : (vector<1xf32>) -> vector<1xf32> // CHECK: %[[U1:.*]] = "some_use"(%[[V1]]) : (vector<2xf32>) -> vector<2xf32> @@ -466,15 +466,15 @@ %u2 = "some_use"(%r2) : (vector<3xf32>) -> vector<3xf32> // Hoists - %w0 = vector.transfer_write %u0, %st0[%c0, %c0] : vector<1xf32>, tensor + %w0 = vector.transfer_write %u0, %st0[%c0, %c0] {in_bounds = [true, false]} : vector<1xf32>, tensor // CHECK-DAG: %[[STI1:.*]] = vector.transfer_write %[[U1]], %{{.*}} : vector<2xf32>, tensor // Does not hoist (associated slice depends on %j). - %w1 = vector.transfer_write %u1, %st1[%i, %i] : vector<2xf32>, tensor + %w1 = vector.transfer_write %u1, %st1[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, tensor // CHECK-DAG: %[[STI2:.*]] = vector.transfer_write %[[U2]], %{{.*}} : vector<3xf32>, tensor // Does not hoist, 2 slice / insert_slice for %arg8. - %w2 = vector.transfer_write %u2, %st2[%c0, %c0] : vector<3xf32>, tensor + %w2 = vector.transfer_write %u2, %st2[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, tensor // Hoists. 
%sti0 = tensor.insert_slice %w0 into %arg6[%i, %i][%step, %step][1, 1] : tensor into tensor @@ -530,8 +530,8 @@ // CHECK: %[[R5:.*]] = "some_use"(%[[R3]]) : (vector<2xf32>) -> vector<2xf32> // CHECK: scf.yield %[[R5]], %[[R4]] : vector<2xf32>, vector<2xf32> // CHECK: } -// CHECK: %[[W0:.*]] = vector.transfer_write %[[F]]#1, %[[T]][%[[C0]], %[[C0]]] : vector<2xf32>, tensor -// CHECK: %[[W1:.*]] = vector.transfer_write %[[F]]#0, %[[W0]][%[[C0]], %[[C3]]] : vector<2xf32>, tensor +// CHECK: %[[W0:.*]] = vector.transfer_write %[[F]]#1, %[[T]][%[[C0]], %[[C0]]] {{.*}} : vector<2xf32>, tensor +// CHECK: %[[W1:.*]] = vector.transfer_write %[[F]]#0, %[[W0]][%[[C0]], %[[C3]]] {{.*}} : vector<2xf32>, tensor // CHECK: return %[[W1]] : tensor func.func @hoist_vector_transfer_write_pairs_disjoint_tensor( %tensor: tensor, @@ -543,14 +543,14 @@ %cst = arith.constant 0.0 : f32 %1 = scf.for %j = %lb to %ub step %step iter_args(%arg5 = %tensor) -> (tensor) { - %r00 = vector.transfer_read %arg5[%c0, %c0], %cst: tensor, vector<2xf32> + %r00 = vector.transfer_read %arg5[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> %u00 = "some_use"(%r00) : (vector<2xf32>) -> vector<2xf32> - %w10 = vector.transfer_write %u00, %arg5[%c0, %c0] : vector<2xf32>, tensor + %w10 = vector.transfer_write %u00, %arg5[%c0, %c0] {in_bounds = [true, false]} : vector<2xf32>, tensor // Hoist by properly bypassing the disjoint write %w10. - %r01 = vector.transfer_read %w10[%c0, %c3], %cst: tensor, vector<2xf32> + %r01 = vector.transfer_read %w10[%c0, %c3], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> %u01 = "some_use"(%r01) : (vector<2xf32>) -> vector<2xf32> - %w11 = vector.transfer_write %u01, %w10[%c0, %c3] : vector<2xf32>, tensor + %w11 = vector.transfer_write %u01, %w10[%c0, %c3] {in_bounds = [true, false]} : vector<2xf32>, tensor scf.yield %w11 : tensor } return %1 : tensor @@ -604,19 +604,19 @@ -> (tensor<100x100xf32>, tensor<200x200xf32>, tensor<300x300xf32>) { // Hoists. %st0 = tensor.extract_slice %arg6[%i, %i][%step, %step][1, 1] : tensor<100x100xf32> to tensor - %r0 = vector.transfer_read %st0[%c0, %c0], %cst: tensor, vector<1xf32> + %r0 = vector.transfer_read %st0[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<1xf32> // CHECK: %[[ST1:.*]] = tensor.extract_slice %[[TENSOR1_ARG_L2]][%[[J]],{{.*}}: tensor<200x200xf32> to tensor // CHECK: %[[V1:.*]] = vector.transfer_read %[[ST1]]{{.*}} : tensor, vector<2xf32> // Does not hoist (slice depends on %j) %st1 = tensor.extract_slice %arg7[%j, %c0][%step, %step][1, 1] : tensor<200x200xf32> to tensor - %r1 = vector.transfer_read %st1[%c0, %c0], %cst: tensor, vector<2xf32> + %r1 = vector.transfer_read %st1[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<2xf32> // CHECK: %[[ST2:.*]] = tensor.extract_slice %[[TENSOR2_ARG_L2]][%[[I]],{{.*}}: tensor<300x300xf32> to tensor // CHECK: %[[V2:.*]] = vector.transfer_read %[[ST2]]{{.*}} : tensor, vector<3xf32> // Does not hoist, 2 slice %arg8. 
%st2 = tensor.extract_slice %arg8[%i, %c0][%step, %step][1, 1] : tensor<300x300xf32> to tensor - %r2 = vector.transfer_read %st2[%c0, %c0], %cst: tensor, vector<3xf32> + %r2 = vector.transfer_read %st2[%c0, %c0], %cst {in_bounds = [true, false]} : tensor, vector<3xf32> // CHECK: %[[U0:.*]] = "some_use"(%[[V0_ARG_L2]]) : (vector<1xf32>) -> vector<1xf32> // CHECK: %[[U1:.*]] = "some_use"(%[[V1]]) : (vector<2xf32>) -> vector<2xf32> @@ -626,15 +626,15 @@ %u2 = "some_use"(%r2) : (vector<3xf32>) -> vector<3xf32> // Hoists - %w0 = vector.transfer_write %u0, %st0[%c0, %c0] : vector<1xf32>, tensor + %w0 = vector.transfer_write %u0, %st0[%c0, %c0] {in_bounds = [true, false]} : vector<1xf32>, tensor // CHECK-DAG: %[[STI1:.*]] = vector.transfer_write %[[U1]], %{{.*}} : vector<2xf32>, tensor // Does not hoist (associated slice depends on %j). - %w1 = vector.transfer_write %u1, %st1[%i, %i] : vector<2xf32>, tensor + %w1 = vector.transfer_write %u1, %st1[%i, %i] {in_bounds = [true, false]} : vector<2xf32>, tensor // CHECK-DAG: %[[STI2:.*]] = vector.transfer_write %[[U2]], %{{.*}} : vector<3xf32>, tensor // Does not hoist, 2 slice / insert_slice for %arg8. - %w2 = vector.transfer_write %u2, %st2[%c0, %c0] : vector<3xf32>, tensor + %w2 = vector.transfer_write %u2, %st2[%c0, %c0] {in_bounds = [true, false]} : vector<3xf32>, tensor // Hoists. %sti0 = tensor.insert_slice %w0 into %arg6[%i, %i][%step, %step][1, 1] : tensor into tensor<100x100xf32> diff --git a/mlir/test/Dialect/Linalg/vectorization-masked.mlir b/mlir/test/Dialect/Linalg/vectorization-masked.mlir --- a/mlir/test/Dialect/Linalg/vectorization-masked.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-masked.mlir @@ -53,7 +53,7 @@ // CHECK-LABEL: @vectorize_dynamic_1d_broadcast // CHECK: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK: %[[VAL_4:.*]] = tensor.dim %{{.*}}, %[[VAL_3]] : tensor -// CHECK: %[[VAL_7:.*]] = vector.transfer_read %{{.*}} {permutation_map = #{{.*}}} : tensor, vector<4xf32> +// CHECK: %[[VAL_7:.*]] = vector.transfer_read %{{.*}} {in_bounds = [true], permutation_map = #{{.*}}} : tensor, vector<4xf32> // CHECK: %[[VAL_9:.*]] = vector.create_mask %[[VAL_4]] : vector<4xi1> // CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor, vector<4xf32> } : vector<4xi1> -> vector<4xf32> // CHECK: %[[VAL_12:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor, vector<4xf32> } : vector<4xi1> -> vector<4xf32> @@ -464,10 +464,10 @@ // CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[VAL_11:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_8]] : vector<8x4xi1> -// CHECK: %[[VAL_12:.*]] = vector.mask %[[VAL_11]] { vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_9]], %[[VAL_9]]], %[[VAL_10]] {in_bounds = [true, true, true], permutation_map = #map} : memref, vector<8x16x4xf32> } : vector<8x4xi1> -> vector<8x16x4xf32> +// CHECK: %[[VAL_12:.*]] = vector.mask %[[VAL_11]] { vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_9]], %[[VAL_9]]], %[[VAL_10]] {in_bounds = [true, true], permutation_map = #map} : memref, vector<8x16x4xf32> } : vector<8x4xi1> -> vector<8x16x4xf32> // CHECK: %[[VAL_13:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[VAL_14:.*]] = vector.create_mask %[[VAL_8]], %[[VAL_6]] : vector<4x16xi1> -// CHECK: %[[VAL_15:.*]] = vector.mask %[[VAL_14]] { vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_9]], %[[VAL_9]]], %[[VAL_13]] {in_bounds = [true, true, true], permutation_map = 
#map1} : memref, vector<8x16x4xf32> } : vector<4x16xi1> -> vector<8x16x4xf32> +// CHECK: %[[VAL_15:.*]] = vector.mask %[[VAL_14]] { vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_9]], %[[VAL_9]]], %[[VAL_13]] {in_bounds = [true, true], permutation_map = #map1} : memref, vector<8x16x4xf32> } : vector<4x16xi1> -> vector<8x16x4xf32> // CHECK: %[[VAL_16:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[VAL_17:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_6]] : vector<8x16xi1> // CHECK: %[[VAL_18:.*]] = vector.mask %[[VAL_17]] { vector.transfer_read %[[VAL_2]]{{\[}}%[[VAL_9]], %[[VAL_9]]], %[[VAL_16]] {in_bounds = [true, true]} : memref, vector<8x16xf32> } : vector<8x16xi1> -> vector<8x16xf32> diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -663,10 +663,10 @@ // CHECK: func @generic_vectorize_broadcast_transpose // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[CF:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[V0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CF]] {in_bounds = [true, true, true, true], permutation_map = #[[$MAP0]]} : memref<4x4xf32>, vector<4x4x4x4xf32> -// CHECK: %[[V1:.*]] = vector.transfer_read %{{.*}}[%[[C0]]], %[[CF]] {in_bounds = [true, true, true, true], permutation_map = #[[$MAP1]]} : memref<4xf32>, vector<4x4x4x4xf32> -// CHECK: %[[V2:.*]] = vector.transfer_read %{{.*}}[%[[C0]]], %[[CF]] {in_bounds = [true, true, true, true], permutation_map = #[[$MAP2]]} : memref<4xf32>, vector<4x4x4x4xf32> -// CHECK: %[[V3:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CF]] {in_bounds = [true, true, true, true], permutation_map = #[[$MAP3]]} : memref<4x4xf32>, vector<4x4x4x4xf32> +// CHECK: %[[V0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CF]] {in_bounds = [true, true], permutation_map = #[[$MAP0]]} : memref<4x4xf32>, vector<4x4x4x4xf32> +// CHECK: %[[V1:.*]] = vector.transfer_read %{{.*}}[%[[C0]]], %[[CF]] {in_bounds = [true], permutation_map = #[[$MAP1]]} : memref<4xf32>, vector<4x4x4x4xf32> +// CHECK: %[[V2:.*]] = vector.transfer_read %{{.*}}[%[[C0]]], %[[CF]] {in_bounds = [true], permutation_map = #[[$MAP2]]} : memref<4xf32>, vector<4x4x4x4xf32> +// CHECK: %[[V3:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CF]] {in_bounds = [true, true], permutation_map = #[[$MAP3]]} : memref<4x4xf32>, vector<4x4x4x4xf32> // CHECK: %[[SUB:.*]] = arith.subf %[[V0]], %[[V1]] : vector<4x4x4x4xf32> // CHECK: %[[ADD0:.*]] = arith.addf %[[V2]], %[[SUB]] : vector<4x4x4x4xf32> // CHECK: %[[ADD1:.*]] = arith.addf %[[V3]], %[[ADD0]] : vector<4x4x4x4xf32> @@ -715,8 +715,8 @@ // CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (0, d1, 0, d0)> // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d2, d1, d3, d0)> // CHECK: func @vectorization_transpose -// CHECK: vector.transfer_read {{.*}}{in_bounds = [true, true, true, true], permutation_map = #[[MAP0]]} : memref<14x7xf32>, vector<7x14x8x16xf32> -// CHECK: vector.transfer_read {{.*}}{in_bounds = [true, true, true, true], permutation_map = #[[MAP1]]} : memref<16x14xf32>, vector<7x14x8x16xf32> +// CHECK: vector.transfer_read {{.*}}{in_bounds = [true, true], permutation_map = #[[MAP0]]} : memref<14x7xf32>, vector<7x14x8x16xf32> +// CHECK: vector.transfer_read {{.*}}{in_bounds = [true, true], permutation_map = #[[MAP1]]} : memref<16x14xf32>, vector<7x14x8x16xf32> // CHECK: vector.transfer_read {{.*}}{in_bounds = [true, 
true, true, true], permutation_map = #[[MAP2]]} : memref<16x14x7x8xf32>, vector<7x14x8x16xf32> // CHECK: arith.addf {{.*}} : vector<7x14x8x16xf32> // CHECK: arith.addf {{.*}} : vector<7x14x8x16xf32> @@ -1132,8 +1132,8 @@ func.func @sum_exp_2(%input: tensor<3x2xf32>, %input_2: tensor<5x4xf32>, %output: tensor<5x2xf32>) -> tensor<5x2xf32> { - // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true, true, true], permutation_map = #[[$M1]]} : tensor<3x2xf32>, vector<2x3x4x5xf32> - // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true, true, true], permutation_map = #[[$M2]]} : tensor<5x4xf32>, vector<2x3x4x5xf32> + // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true], permutation_map = #[[$M1]]} : tensor<3x2xf32>, vector<2x3x4x5xf32> + // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true], permutation_map = #[[$M2]]} : tensor<5x4xf32>, vector<2x3x4x5xf32> // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true], permutation_map = #[[$M3]]} : tensor<5x2xf32>, vector<2x5xf32> // CHECK: math.exp {{.*}} : vector<2x3x4x5xf32> // CHECK: math.exp {{.*}} : vector<2x3x4x5xf32> diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir --- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir +++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir @@ -149,7 +149,7 @@ // CHECK: %[[VAL_17:.*]] = vector.shape_cast %[[VAL_12]] : vector<1x4xindex> to vector<4xindex> // CHECK: %[[VAL_18:.*]] = vector.extractelement %[[VAL_17]]{{\[}}%[[VAL_7]] : i32] : vector<4xindex> // CHECK: %[[VAL_19:.*]] = vector.extractelement %[[VAL_16]]{{\[}}%[[VAL_7]] : i32] : vector<4xindex> -// CHECK: %[[VAL_20:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_18]], %[[VAL_10]], %[[VAL_19]]], %[[VAL_8]] {in_bounds = [true, true]} : tensor<45x80x16xf32>, vector<1x4xf32> +// CHECK: %[[VAL_20:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_18]], %[[VAL_10]], %[[VAL_19]]], %[[VAL_8]] {in_bounds = [true, true, true]} : tensor<45x80x16xf32>, vector<1x4xf32> // CHECK: %[[VAL_21:.*]] = vector.transfer_write %[[VAL_20]], %[[VAL_5]]{{\[}}%[[VAL_9]], %[[VAL_9]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32> // CHECK: return %[[VAL_21]] : tensor<1x4xf32> // CHECK: } diff --git a/mlir/test/Dialect/MemRef/extract-address-computations.mlir b/mlir/test/Dialect/MemRef/extract-address-computations.mlir --- a/mlir/test/Dialect/MemRef/extract-address-computations.mlir +++ b/mlir/test/Dialect/MemRef/extract-address-computations.mlir @@ -267,13 +267,13 @@ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[CF0:.*]] = arith.constant 0.0{{0*e\+00}} : f16 // CHECK-DAG: %[[SUBVIEW:.*]] = memref.subview %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]] [%[[DYN_SIZE0]], %[[DYN_SIZE1]], %[[DYN_SIZE2]]] [1, 1, 1] : memref to memref> -// CHECK: %[[LOADED_VAL:.*]] = vector.transfer_read %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]], %[[CF0]] {permutation_map = #[[$PERMUTATION_MAP]]} : memref>, vector<4x2xf16> +// CHECK: %[[LOADED_VAL:.*]] = vector.transfer_read %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]], %[[CF0]] {in_bounds = [false, true, false], permutation_map = #[[$PERMUTATION_MAP]]} : memref>, vector<4x2xf16> // CHECK: return %[[LOADED_VAL]] : vector<4x2xf16> func.func @test_transfer_read_op(%base : memref, %offset0 : index, %offset1: index, %offset2: index) -> vector<4x2xf16> { %cf0 = arith.constant 0.0 : f16 - %loaded_val = vector.transfer_read %base[%offset0, %offset1, %offset2], %cf0 { 
permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : memref, vector<4x2xf16> + %loaded_val = vector.transfer_read %base[%offset0, %offset1, %offset2], %cf0 { in_bounds = [false, true, false], permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : memref, vector<4x2xf16> return %loaded_val : vector<4x2xf16> } @@ -297,13 +297,13 @@ // CHECK-SAME: %[[DYN_OFFSET1:[^:]*]]: index, // CHECK-SAME: %[[DYN_OFFSET2:[^:]*]]: index) // CHECK: %[[CF0:.*]] = arith.constant 0.0{{0*e\+00}} : f16 -// CHECK: %[[LOADED_VAL:.*]] = vector.transfer_read %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]], %[[CF0]] {permutation_map = #[[$PERMUTATION_MAP]]} : tensor, vector<4x2xf16> +// CHECK: %[[LOADED_VAL:.*]] = vector.transfer_read %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]], %[[CF0]] {in_bounds = [false, true, false], permutation_map = #[[$PERMUTATION_MAP]]} : tensor, vector<4x2xf16> // CHECK: return %[[LOADED_VAL]] : vector<4x2xf16> func.func @test_transfer_read_op_with_tensor(%base : tensor, %offset0 : index, %offset1: index, %offset2: index) -> vector<4x2xf16> { %cf0 = arith.constant 0.0 : f16 - %loaded_val = vector.transfer_read %base[%offset0, %offset1, %offset2], %cf0 { permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : tensor, vector<4x2xf16> + %loaded_val = vector.transfer_read %base[%offset0, %offset1, %offset2], %cf0 { in_bounds = [false, true, false], permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : tensor, vector<4x2xf16> return %loaded_val : vector<4x2xf16> } @@ -334,12 +334,12 @@ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VCF0:.*]] = arith.constant dense<0.0{{0*e\+00}}> : vector<4x2xf16> // CHECK-DAG: %[[SUBVIEW:.*]] = memref.subview %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]] [%[[DYN_SIZE0]], %[[DYN_SIZE1]], %[[DYN_SIZE2]]] [1, 1, 1] : memref to memref> -// CHECK: vector.transfer_write %[[VCF0]], %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]] {permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, memref> +// CHECK: vector.transfer_write %[[VCF0]], %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [false, true, false], permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, memref> // CHECK: return func.func @test_transfer_write_op(%base : memref, %offset0 : index, %offset1: index, %offset2: index) { %vcf0 = arith.constant dense<0.000000e+00> : vector<4x2xf16> - vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, memref + vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { in_bounds = [false, true, false], permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, memref return } @@ -371,12 +371,12 @@ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VCF0:.*]] = arith.constant dense<0.0{{0*e\+00}}> : vector<4x2xf16> // CHECK-DAG: %[[SUBVIEW:.*]] = memref.subview %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]] [%[[DYN_SIZE0]], %[[DYN_SIZE1]], %[[DYN_SIZE2]]] [1, 1, 1] : memref> to memref> -// CHECK: vector.transfer_write %[[VCF0]], %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]] {permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, memref> +// CHECK: vector.transfer_write %[[VCF0]], %[[SUBVIEW]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [false, true, false], permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, memref> // CHECK: return func.func @test_transfer_write_op_with_strides(%base : memref>, %offset0 : index, %offset1: index, %offset2: 
index) { %vcf0 = arith.constant dense<0.000000e+00> : vector<4x2xf16> - vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, memref> + vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { in_bounds = [false, true, false], permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, memref> return } @@ -400,12 +400,12 @@ // CHECK-SAME: %[[DYN_OFFSET1:[^:]*]]: index, // CHECK-SAME: %[[DYN_OFFSET2:[^:]*]]: index) // CHECK-DAG: %[[VCF0:.*]] = arith.constant dense<0.0{{0*e\+00}}> : vector<4x2xf16> -// CHECK: %[[RES:.*]] = vector.transfer_write %[[VCF0]], %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]] {permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, tensor +// CHECK: %[[RES:.*]] = vector.transfer_write %[[VCF0]], %[[BASE]][%[[DYN_OFFSET0]], %[[DYN_OFFSET1]], %[[DYN_OFFSET2]]] {in_bounds = [false, true, false], permutation_map = #[[$PERMUTATION_MAP]]} : vector<4x2xf16>, tensor // CHECK: return %[[RES]] : tensor func.func @test_transfer_write_op_with_tensor(%base : tensor, %offset0 : index, %offset1: index, %offset2: index) -> tensor { %vcf0 = arith.constant dense<0.000000e+00> : vector<4x2xf16> - %res = vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, tensor + %res = vector.transfer_write %vcf0, %base[%offset0, %offset1, %offset2] { in_bounds = [false, true, false], permutation_map = affine_map<(d0,d1,d2) -> (d2,d0)> } : vector<4x2xf16>, tensor return %res : tensor } diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir --- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir +++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir @@ -102,7 +102,7 @@ %f1 = arith.constant 1.0 : f32 %0 = memref.subview %arg0[%arg1, %arg2][4, 4][%arg5, %arg6] : memref<12x32xf32> to memref<4x4xf32, strided<[?, ?], offset: ?>> - %1 = vector.transfer_read %0[%arg3, %arg4], %f1 {in_bounds = [true]} : memref<4x4xf32, strided<[?, ?], offset: ?>>, vector<4xf32> + %1 = vector.transfer_read %0[%arg3, %arg4], %f1 {in_bounds = [true, true]} : memref<4x4xf32, strided<[?, ?], offset: ?>>, vector<4xf32> return %1 : vector<4xf32> } // CHECK: func @fold_subview_with_transfer_read @@ -132,7 +132,7 @@ func.func @fold_static_stride_subview_with_transfer_write(%arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5: index, %arg6 : index, %arg7 : vector<4xf32>) { %0 = memref.subview %arg0[%arg1, %arg2][4, 4][%arg5, %arg6] : memref<12x32xf32> to memref<4x4xf32, strided<[?, ?], offset: ?>> - vector.transfer_write %arg7, %0[%arg3, %arg4] {in_bounds = [true]} : vector<4xf32>, memref<4x4xf32, strided<[?, ?], offset: ?>> + vector.transfer_write %arg7, %0[%arg3, %arg4] {in_bounds = [true, true]} : vector<4xf32>, memref<4x4xf32, strided<[?, ?], offset: ?>> return } // CHECK: func @fold_static_stride_subview_with_transfer_write @@ -186,7 +186,7 @@ %0 = memref.subview %arg0[0, %arg1, %arg2] [1, %arg3, %arg4] [1, 1, 1] : memref> to memref> - %1 = vector.transfer_read %0[%arg5, %arg6], %cst {in_bounds = [true]} + %1 = vector.transfer_read %0[%arg5, %arg6], %cst {in_bounds = [true, true]} : memref>, vector<4xf32> return %1 : vector<4xf32> } @@ -214,7 +214,7 @@ %0 = memref.subview %arg0[0, %arg2, %arg3] [1, %arg4, %arg5] [1, 1, 1] : memref> to memref> - vector.transfer_write %arg1, %0[%arg6, %arg7] {in_bounds = [true]} 
+ vector.transfer_write %arg1, %0[%arg6, %arg7] {in_bounds = [true, true]} : vector<4xf32>, memref> return } @@ -231,7 +231,7 @@ // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG6]]] // CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG7]]] -// CHECK-DAG: vector.transfer_write %[[ARG1]], %[[ARG0]][%[[C0]], %[[IDX0]], %[[IDX1]]] {in_bounds = [true]} : vector<4xf32>, memref, memref> to memref> - vector.transfer_write %arg1, %0[%arg6, %arg7] {in_bounds = [true]} + vector.transfer_write %arg1, %0[%arg6, %arg7] {in_bounds = [true, true]} : vector<4xf32>, memref> return } @@ -262,7 +262,7 @@ // CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG6]]] // CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG7]]] // CHECK-DAG: vector.transfer_write %[[ARG1]], %[[ARG0]][%[[IDX0]], %[[IDX1]], %[[C0]]] -// CHECK-SAME: {in_bounds = [true], permutation_map = #[[MAP2]]} : vector<4xf32>, memref, memref, %[[s1:.*]]: index, %[[s2:.*]]: index // CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index // CHECK: %[[add:.*]] = affine.apply #[[$map]]()[%[[s1]]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true]} : tensor, vector<6xf32> +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<6xf32> // CHECK: return %[[r]] func.func @transfer_read_of_extract_slice_1d(%t : tensor, %s1 : index, %s2 : index) -> vector<6xf32> { %c3 = arith.constant 3 : index %c4 = arith.constant 4 : index %cst = arith.constant 0.0 : f32 %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor to tensor<10x?xf32> - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true]} : tensor<10x?xf32>, vector<6xf32> + %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor<10x?xf32>, vector<6xf32> return %1 : vector<6xf32> } @@ -46,7 +46,7 @@ // CHECK-DAG: %[[c5:.*]] = arith.constant 5 : index // CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index // CHECK: %[[add:.*]] = affine.apply #[[$map1]]()[%[[s1]]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<5x6xf32> +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true, true]} : tensor, vector<5x6xf32> // CHECK: return %[[r]] func.func @transfer_read_of_extract_slice_rank_reducing(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { %c3 = arith.constant 3 : index @@ -61,7 +61,7 @@ // CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index // CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index // CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[s1]], %[[c10]]], %{{.*}} {in_bounds = [true, true], permutation_map = #[[$map2]]} : tensor, vector<5x6xf32> +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[s1]], %[[c10]]], %{{.*}} {in_bounds = [true, true, true], permutation_map = #[[$map2]]} : tensor, vector<5x6xf32> // CHECK: return %[[r]] func.func @transfer_read_of_extract_slice_non_leading_rank_reduction(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { %c3 = arith.constant 3 : index @@ -88,7 +88,7 @@ // CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index // CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK: 
%[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true], permutation_map = #[[$map2]]} : vector<5x6xf32>, tensor +// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true, true], permutation_map = #[[$map2]]} : vector<5x6xf32>, tensor func.func @insert_slice_of_transfer_write_non_leading_rank_reduction(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { %c0 = arith.constant 0 : index %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> @@ -100,7 +100,7 @@ // CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index // CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor +// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true, true]} : vector<5x6xf32>, tensor // CHECK: return %[[r]] func.func @insert_slice_of_transfer_write_rank_extending(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir b/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir --- a/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir +++ b/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir @@ -8,7 +8,7 @@ %0 = tensor.extract_slice %arg0[0, %arg1, %arg2] [1, %arg3, %arg4] [1, 1, 1] : tensor<?x?x?xf32> to tensor<?x?xf32> - %1 = vector.transfer_read %0[%arg5, %arg6], %cst {in_bounds = [true]} + %1 = vector.transfer_read %0[%arg5, %arg6], %cst {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4xf32> return %1 : vector<4xf32> } @@ -40,7 +40,7 @@ // Can't fold this atm since we don't emit the proper vector.extract_strided_slice.
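// (Illustration of the new convention, using assumed placeholder names: the
// in_bounds array now has one entry per source dimension rather than per
// vector dimension, so a 1-D read from a rank-3 slice carries three entries:
//   %v = vector.transfer_read %slice[%c1, %i, %c2], %f0 {in_bounds = [true, true, true]}
//       : tensor<1x4x4xf32>, vector<4xf32>
// Here %slice, %i, %c1, %c2 and %f0 are hypothetical values, not part of the test.)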
// CHECK: tensor.extract_slice %0 = tensor.extract_slice %src[0, %i1, %i2, %i3] [1, 4, 1, 4] [2, 3, 4, 5] : tensor<1x8x8x8xf32> to tensor<1x4x4xf32> - %1 = vector.transfer_read %0[%c1, %i4, %c2], %f0 {in_bounds = [true]} : tensor<1x4x4xf32>, vector<4xf32> + %1 = vector.transfer_read %0[%c1, %i4, %c2], %f0 {in_bounds = [true, true, true]} : tensor<1x4x4xf32>, vector<4xf32> return %1 : vector<4xf32> } @@ -87,14 +87,14 @@ // CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index // CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index // CHECK: %[[add:.*]] = affine.apply #[[$ADD_4]]()[%[[s1]]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true]} : tensor, vector<6xf32> +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<6xf32> // CHECK: return %[[r]] func.func @transfer_read_of_extract_slice(%t : tensor, %s1 : index, %s2 : index) -> vector<6xf32> { %c3 = arith.constant 3 : index %c4 = arith.constant 4 : index %cst = arith.constant 0.0 : f32 %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor to tensor<10x?xf32> - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true]} : tensor<10x?xf32>, vector<6xf32> + %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor<10x?xf32>, vector<6xf32> return %1 : vector<6xf32> } @@ -107,7 +107,7 @@ // CHECK-DAG: %[[c5:.*]] = arith.constant 5 : index // CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index // CHECK: %[[add:.*]] = affine.apply #[[$ADD_3]]()[%[[s1]]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<5x6xf32> +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true, true]} : tensor, vector<5x6xf32> // CHECK: return %[[r]] func.func @transfer_read_of_extract_slice_rank_reducing(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { %c3 = arith.constant 3 : index @@ -166,8 +166,8 @@ // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG6]]] // CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG7]]] -// CHECK-DAG: vector.transfer_write %[[ARG1]], %[[ARG0]][%[[C0]], %[[IDX0]], %[[IDX1]]] {in_bounds = [true]} : vector<4xf32>, tensor, tensor, tensor %1 = tensor.insert_slice %0 into %arg0[0, %arg2, %arg3] [1, %arg4, %arg5] [1, 1, 1] : tensor into tensor @@ -200,8 +200,8 @@ // CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG6]]] // CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG7]]] // CHECK-DAG: vector.transfer_write %[[ARG1]], %[[ARG0]][%[[IDX0]], %[[IDX1]], %[[C0]]] - // CHECK-SAME: {in_bounds = [true], permutation_map = #[[MAP2]]} : vector<4xf32>, tensor, tensor, tensor %1 = tensor.insert_slice %0 into %arg0[%arg2, %arg3, 0] [%arg4, %arg5, 1] [1, 1, 1] : tensor into tensor @@ -239,7 +239,7 @@ // CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index // CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] -// CHECK-SAME: {in_bounds = [true, true], permutation_map = #[[$d0d2]]} : vector<5x6xf32>, tensor +// CHECK-SAME: {in_bounds = [true, true, true], permutation_map = #[[$d0d2]]} : vector<5x6xf32>, tensor // CHECK: return %[[r]] %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> %1 = 
tensor.insert_slice %0 into %t1[4, 3, %s] [5, 1, 6] [1, 1, 1] : tensor<5x6xf32> into tensor @@ -252,7 +252,7 @@ // CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index // CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor +// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true, true]} : vector<5x6xf32>, tensor // CHECK: return %[[r]] func.func @insert_slice_of_transfer_write_rank_extending(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -727,12 +727,16 @@ // ----- // CHECK-LABEL: fold_vector_transfers -func.func @fold_vector_transfers(%A: memref) -> (vector<4x8xf32>, vector<4x9xf32>) { +func.func @fold_vector_transfers(%A: memref, + %B: memref<9x8xf32>, + %idx: index) + -> (vector<4x8xf32>, vector<4x9xf32>, vector<1x2xf32>, vector<5xf32>) +{ %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 - // CHECK: vector.transfer_read %{{.*}} {in_bounds = [false, true]} - %1 = vector.transfer_read %A[%c0, %c0], %f0 : memref, vector<4x8xf32> + // CHECK: vector.transfer_read %{{.*}} {in_bounds = [true, true]} + %1 = vector.transfer_read %A[%c0, %c0], %f0 {in_bounds = [true, true]} : memref, vector<4x8xf32> // CHECK: vector.transfer_write %{{.*}} {in_bounds = [false, true]} vector.transfer_write %1, %A[%c0, %c0] : vector<4x8xf32>, memref @@ -747,8 +751,17 @@ // CHECK-NOT: in_bounds vector.transfer_write %2, %A[%c0, %c0] : vector<4x9xf32>, memref + // Cannot infer in_bounds for non-constant offsets or dynamic dim sizes. 
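+ // (Sketch of the assumed inference rule, for illustration only: a source dim
+ // can be folded to in-bounds when its offset is a known constant and the
+ // accessed extent fits the static dim size. For the vector<5xf32> read from
+ // memref<9x8xf32> below: 0 + 1 <= 9 and 0 + 5 <= 8, so the read folds to
+ // in_bounds = [true, true].)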
+ // CHECK: vector.transfer_read {{.*}} + // CHECK-NOT: in_bounds + %3 = vector.transfer_read %A[%c0, %idx], %f0 : memref, vector<1x2xf32> + + // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} + %4 = vector.transfer_read %B[%c0, %c0], %f0 {in_bounds = [true, false]} : memref<9x8xf32>, vector<5xf32> + // CHECK: return - return %1, %2 : vector<4x8xf32>, vector<4x9xf32> + return %1, %2, %3, %4 + : vector<4x8xf32>, vector<4x9xf32>, vector<1x2xf32>, vector<5xf32> } // ----- @@ -1130,7 +1143,7 @@ %cf0 = arith.constant 0.0 : f32 %w0 = vector.transfer_write %v0, %arg0[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, tensor<4x4xf32> - %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true, true], + %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d0, d1, 0)>} : tensor<4x4xf32>, vector<4x2x6xf32> return %0 : vector<4x2x6xf32> @@ -1147,10 +1160,10 @@ %v0 : vector<4x1xf32>) -> vector<1x100x4x5xf32> { %c0 = arith.constant 0 : index %cf0 = arith.constant 0.0 : f32 - %w0 = vector.transfer_write %v0, %arg0[%c0, %c0, %c0] {in_bounds = [true, true], + %w0 = vector.transfer_write %v0, %arg0[%c0, %c0, %c0] {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1, d2) -> (d2, d1)>} : vector<4x1xf32>, tensor<4x4x4xf32> - %0 = vector.transfer_read %w0[%c0, %c0, %c0], %cf0 {in_bounds = [true, true, true, true], + %0 = vector.transfer_read %w0[%c0, %c0, %c0], %cf0 {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1, d2) -> (d1, 0, d2, 0)>} : tensor<4x4x4xf32>, vector<1x100x4x5xf32> return %0 : vector<1x100x4x5xf32> @@ -1230,7 +1243,7 @@ // CHECK-SAME: [%[[IV]], 16] [%[[SZ]], 8] // CHECK: %[[T1:.*]] = vector.transfer_write %[[VEC]] // CHECK-SAME: %[[T0]][%[[C0]], %[[C0]]] - // CHECK-SAME: in_bounds = [true, false] + // CHECK-SAME: in_bounds = [false, true] // CHECK-SAME: permutation_map = #[[$MAP]] // CHECK: %[[T2:.*]] = tensor.insert_slice %[[T1]] into %[[ITER_ARG]] // CHECK-SAME: [%[[IV]], 16] [%[[SZ]], 8] @@ -2088,7 +2101,7 @@ %c0 = arith.constant 0 : index %f0 = arith.constant 0.000000e+00 : f32 %0 = tensor.extract_slice %src[0, %i1, %i2, %i3] [1, 4, 1, 4] [1, 1, 1, 1] : tensor<1x8x8x8xf32> to tensor<1x4x4xf32> - %1 = vector.transfer_read %0[%c0, %i4, %c0], %f0 {in_bounds = [true]} : tensor<1x4x4xf32>, vector<4xf32> + %1 = vector.transfer_read %0[%c0, %i4, %c0], %f0 {in_bounds = [true, true, true]} : tensor<1x4x4xf32>, vector<4xf32> return %1 : vector<4xf32> } diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -383,7 +383,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant 3.0 : f32 // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}} - %0 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0 + d1)>} : memref, vector<128xf32> + %0 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0 + d1)>, in_bounds = [true, true]} : memref, vector<128xf32> } // ----- @@ -392,7 +392,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant 3.0 : f32 // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}} - %0 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0 + 1)>} : memref, vector<128xf32> + %0 = 
vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0 + 1)>, in_bounds = [true, true]} : memref, vector<128xf32> } // ----- @@ -401,7 +401,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant 3.0 : f32 // expected-error@+1 {{requires a permutation_map that is a permutation (found one dim used more than once)}} - %0 = vector.transfer_read %arg0[%c3, %c3, %c3], %cst {permutation_map = affine_map<(d0, d1, d2)->(d0, d0)>} : memref, vector<3x7xf32> + %0 = vector.transfer_read %arg0[%c3, %c3, %c3], %cst {permutation_map = affine_map<(d0, d1, d2)->(d0, d0)>, in_bounds = [true, true, true]} : memref, vector<3x7xf32> } // ----- @@ -442,22 +442,12 @@ %c3 = arith.constant 3 : index %f0 = arith.constant 0.0 : f32 %vf0 = vector.splat %f0 : vector<2x3xf32> - // expected-error@+1 {{ expects the optional in_bounds attr of same rank as permutation_map results: affine_map<(d0, d1) -> (d0, d1)>}} + // expected-error@+1 {{ expects the optional in_bounds attr of same rank as the source type: 2 vs inBounds of size: 1}} %0 = vector.transfer_read %arg0[%c3, %c3], %vf0 {in_bounds = [true], permutation_map = affine_map<(d0, d1)->(d0, d1)>} : memref>, vector<1x1x2x3xf32> } // ----- -func.func @test_vector.transfer_read(%arg0: memref>) { - %c3 = arith.constant 3 : index - %f0 = arith.constant 0.0 : f32 - %vf0 = vector.splat %f0 : vector<2x3xf32> - // expected-error@+1 {{requires broadcast dimensions to be in-bounds}} - %0 = vector.transfer_read %arg0[%c3, %c3], %vf0 {in_bounds = [false, true], permutation_map = affine_map<(d0, d1)->(0, d1)>} : memref>, vector<1x1x2x3xf32> -} - -// ----- - func.func @test_vector.transfer_read(%arg0: memref>) { %c3 = arith.constant 3 : index %f0 = arith.constant 0.0 : f32 @@ -538,7 +528,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant dense<3.0> : vector<128 x f32> // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}} - vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0 + d1)>} : vector<128xf32>, memref + vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0 + d1)>, in_bounds = [true, true]} : vector<128xf32>, memref } // ----- @@ -547,7 +537,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant dense<3.0> : vector<128 x f32> // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}} - vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0 + 1)>} : vector<128xf32>, memref + vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0 + 1)>, in_bounds = [true, true]} : vector<128xf32>, memref } // ----- @@ -556,7 +546,7 @@ %c3 = arith.constant 3 : index %cst = arith.constant dense<3.0> : vector<3 x 7 x f32> // expected-error@+1 {{requires a permutation_map that is a permutation (found one dim used more than once)}} - vector.transfer_write %cst, %arg0[%c3, %c3, %c3] {permutation_map = affine_map<(d0, d1, d2)->(d0, d0)>} : vector<3x7xf32>, memref + vector.transfer_write %cst, %arg0[%c3, %c3, %c3] {permutation_map = affine_map<(d0, d1, d2)->(d0, d0)>, in_bounds = [true, true, true]} : vector<3x7xf32>, memref } // ----- diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -20,13 +20,13 @@ -> tensor { %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 - 
%0 = vector.transfer_read %arg0[%c0], %f0 {permutation_map = affine_map<(d0)->()>} : + %0 = vector.transfer_read %arg0[%c0], %f0 {permutation_map = affine_map<(d0)->()>, in_bounds = [true]} : tensor, vector - %1 = vector.transfer_write %0, %arg0[%c0] {permutation_map = affine_map<(d0)->()>} : + %1 = vector.transfer_write %0, %arg0[%c0] {permutation_map = affine_map<(d0)->()>, in_bounds = [true]} : vector, tensor - %2 = vector.transfer_read %arg1[%c0, %c0], %f0 {permutation_map = affine_map<(d0, d1)->()>} : + %2 = vector.transfer_read %arg1[%c0, %c0], %f0 {permutation_map = affine_map<(d0, d1)->()>, in_bounds = [true, true]} : memref, vector - vector.transfer_write %2, %arg1[%c0, %c0] {permutation_map = affine_map<(d0, d1)->()>} : + vector.transfer_write %2, %arg1[%c0, %c0] {permutation_map = affine_map<(d0, d1)->()>, in_bounds = [true, true]} : vector, memref return %1: tensor } @@ -52,40 +52,40 @@ %m2 = vector.splat %i1 : vector<4x5xi1> // // CHECK: vector.transfer_read - %0 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d0)>} : memref, vector<128xf32> + %0 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : memref, vector<128xf32> // CHECK: vector.transfer_read %1 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d1, d0)>} : memref, vector<3x7xf32> // CHECK: vector.transfer_read - %2 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0)>} : memref, vector<128xf32> + %2 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : memref, vector<128xf32> // CHECK: vector.transfer_read - %3 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d1)>} : memref, vector<128xf32> + %3 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d1)>, in_bounds = [true, false]} : memref, vector<128xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : memref>, vector<1x1x4x3xf32> %4 = vector.transfer_read %arg1[%c3, %c3], %vf0 {permutation_map = affine_map<(d0, d1)->(d0, d1)>} : memref>, vector<1x1x4x3xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} {in_bounds = [false, true]} : memref>, vector<1x1x4x3xf32> %5 = vector.transfer_read %arg1[%c3, %c3], %vf0 {in_bounds = [false, true]} : memref>, vector<1x1x4x3xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : memref>, vector<5x24xi8> - %6 = vector.transfer_read %arg2[%c3, %c3], %v0 : memref>, vector<5x24xi8> + %6 = vector.transfer_read %arg2[%c3, %c3], %v0 {in_bounds = [true, true]} : memref>, vector<5x24xi8> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : memref>, vector<5x48xi8> - %7 = vector.transfer_read %arg3[%c3, %c3], %vi0 : memref>, vector<5x48xi8> + %7 = vector.transfer_read %arg3[%c3, %c3], %vi0 {in_bounds = [true, true]} : memref>, vector<5x48xi8> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}}, %{{.*}} : memref, vector<5xf32> - %8 = vector.transfer_read %arg0[%c3, %c3], %f0, %m : memref, vector<5xf32> + %8 = vector.transfer_read %arg0[%c3, %c3], %f0, %m {in_bounds = [true, false]} : memref, vector<5xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]], %[[C3]]], %{{.*}}, %{{.*}} : memref, vector<5x4x8xf32> - %9 = vector.transfer_read %arg4[%c3, %c3, %c3], %f0, %m2 {permutation_map = affine_map<(d0, d1, d2)->(d1, d0, 0)>} : memref, 
vector<5x4x8xf32> + %9 = vector.transfer_read %arg4[%c3, %c3, %c3], %f0, %m2 {permutation_map = affine_map<(d0, d1, d2)->(d1, d0, 0)>, in_bounds = [false, false, true]} : memref, vector<5x4x8xf32> // CHECK: vector.transfer_write - vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0)>} : vector<128xf32>, memref + vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : vector<128xf32>, memref // CHECK: vector.transfer_write vector.transfer_write %1, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d1, d0)>} : vector<3x7xf32>, memref // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<1x1x4x3xf32>, memref> vector.transfer_write %4, %arg1[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0, d1)>} : vector<1x1x4x3xf32>, memref> // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<1x1x4x3xf32>, memref> vector.transfer_write %5, %arg1[%c3, %c3] {in_bounds = [false, false]} : vector<1x1x4x3xf32>, memref> - // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<5x24xi8>, memref> - vector.transfer_write %6, %arg2[%c3, %c3] : vector<5x24xi8>, memref> - // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<5x48xi8>, memref> - vector.transfer_write %7, %arg3[%c3, %c3] : vector<5x48xi8>, memref> + // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] {in_bounds = [true, true]} : vector<5x24xi8>, memref> + vector.transfer_write %6, %arg2[%c3, %c3] {in_bounds = [true, true]} : vector<5x24xi8>, memref> + // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] {in_bounds = [true, true]} : vector<5x48xi8>, memref> + vector.transfer_write %7, %arg3[%c3, %c3] {in_bounds = [true, true]} : vector<5x48xi8>, memref> // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : vector<5xf32>, memref - vector.transfer_write %8, %arg0[%c3, %c3], %m : vector<5xf32>, memref + vector.transfer_write %8, %arg0[%c3, %c3], %m {in_bounds = [true, false]} : vector<5xf32>, memref return } @@ -112,35 +112,35 @@ // // CHECK: vector.transfer_read - %0 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d0)>} : tensor, vector<128xf32> + %0 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : tensor, vector<128xf32> // CHECK: vector.transfer_read - %1 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d1, d0)>} : tensor, vector<3x7xf32> + %1 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = affine_map<(d0, d1)->(d1, d0)>, in_bounds = [true, true]} : tensor, vector<3x7xf32> // CHECK: vector.transfer_read - %2 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0)>} : tensor, vector<128xf32> + %2 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : tensor, vector<128xf32> // CHECK: vector.transfer_read - %3 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d1)>} : tensor, vector<128xf32> + %3 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d1)>, in_bounds = [true, false]} : tensor, vector<128xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : tensor>, vector<1x1x4x3xf32> %4 = vector.transfer_read %arg1[%c3, %c3], %vf0 {permutation_map = affine_map<(d0, d1)->(d0, 
d1)>} : tensor>, vector<1x1x4x3xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} {in_bounds = [false, true]} : tensor>, vector<1x1x4x3xf32> %5 = vector.transfer_read %arg1[%c3, %c3], %vf0 {in_bounds = [false, true]} : tensor>, vector<1x1x4x3xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : tensor>, vector<5x24xi8> - %6 = vector.transfer_read %arg2[%c3, %c3], %v0 : tensor>, vector<5x24xi8> + %6 = vector.transfer_read %arg2[%c3, %c3], %v0 {in_bounds = [true, true]} : tensor>, vector<5x24xi8> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : tensor>, vector<5x48xi8> - %7 = vector.transfer_read %arg3[%c3, %c3], %vi0 : tensor>, vector<5x48xi8> + %7 = vector.transfer_read %arg3[%c3, %c3], %vi0 {in_bounds = [true, true]} : tensor>, vector<5x48xi8> // CHECK: vector.transfer_write - %8 = vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0)>} : vector<128xf32>, tensor + %8 = vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0)>, in_bounds = [false, true]} : vector<128xf32>, tensor // CHECK: vector.transfer_write %9 = vector.transfer_write %1, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d1, d0)>} : vector<3x7xf32>, tensor // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<1x1x4x3xf32>, tensor> %10 = vector.transfer_write %4, %arg1[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0, d1)>} : vector<1x1x4x3xf32>, tensor> // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<1x1x4x3xf32>, tensor> %11 = vector.transfer_write %5, %arg1[%c3, %c3] {in_bounds = [false, false]} : vector<1x1x4x3xf32>, tensor> - // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<5x24xi8>, tensor> - %12 = vector.transfer_write %6, %arg2[%c3, %c3] : vector<5x24xi8>, tensor> - // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<5x48xi8>, tensor> - %13 = vector.transfer_write %7, %arg3[%c3, %c3] : vector<5x48xi8>, tensor> + // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] {in_bounds = [true, true]} : vector<5x24xi8>, tensor> + %12 = vector.transfer_write %6, %arg2[%c3, %c3] {in_bounds = [true, true]} : vector<5x24xi8>, tensor> + // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] {in_bounds = [true, true]} : vector<5x48xi8>, tensor> + %13 = vector.transfer_write %7, %arg3[%c3, %c3] {in_bounds = [true, true]} : vector<5x48xi8>, tensor> return %8, %9, %10, %11, %12, %13 : tensor, tensor, tensor>, diff --git a/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir b/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir --- a/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir +++ b/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir @@ -7,7 +7,7 @@ // CHECK: return %[[r]] func.func @transfer_read_0d(%m: memref, %idx: index) -> f32 { %cst = arith.constant 0.0 : f32 - %0 = vector.transfer_read %m[%idx, %idx, %idx], %cst : memref, vector + %0 = vector.transfer_read %m[%idx, %idx, %idx], %cst {in_bounds = [true, true, true]} : memref, vector %1 = vector.extractelement %0[] : vector return %1 : f32 } @@ -23,7 +23,7 @@ func.func @transfer_read_1d(%m: memref, %idx: index, %idx2: index) -> f32 { %cst = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index - %0 = vector.transfer_read %m[%idx, %idx, %idx], %cst {in_bounds = [true]} : memref, vector<5xf32> + %0 = vector.transfer_read %m[%idx, %idx, %idx], %cst {in_bounds = [true, 
true, true]} : memref<?x?x?xf32>, vector<5xf32> %1 = vector.extractelement %0[%idx2 : index] : vector<5xf32> return %1 : f32 } @@ -36,7 +36,7 @@ // CHECK: return %[[r]] func.func @tensor_transfer_read_0d(%t: tensor<?x?x?xf32>, %idx: index) -> f32 { %cst = arith.constant 0.0 : f32 - %0 = vector.transfer_read %t[%idx, %idx, %idx], %cst : tensor<?x?x?xf32>, vector<f32> + %0 = vector.transfer_read %t[%idx, %idx, %idx], %cst {in_bounds = [true, true, true]} : tensor<?x?x?xf32>, vector<f32> %1 = vector.extractelement %0[] : vector<f32> return %1 : f32 } @@ -50,7 +50,7 @@ // CHECK: memref.store %[[extract]], %[[m]][%[[idx]], %[[idx]], %[[idx]]] func.func @transfer_write_0d(%m: memref<?x?x?xf32>, %idx: index, %f: f32) { %0 = vector.broadcast %f : f32 to vector<f32> - vector.transfer_write %0, %m[%idx, %idx, %idx] : vector<f32>, memref<?x?x?xf32> + vector.transfer_write %0, %m[%idx, %idx, %idx] {in_bounds = [true, true, true]} : vector<f32>, memref<?x?x?xf32> return } @@ -61,7 +61,7 @@ // CHECK: memref.store %[[f]], %[[m]][%[[idx]], %[[idx]], %[[idx]]] func.func @transfer_write_1d(%m: memref<?x?x?xf32>, %idx: index, %f: f32) { %0 = vector.broadcast %f : f32 to vector<1xf32> - vector.transfer_write %0, %m[%idx, %idx, %idx] : vector<1xf32>, memref<?x?x?xf32> + vector.transfer_write %0, %m[%idx, %idx, %idx] {in_bounds = [true, true, true]} : vector<1xf32>, memref<?x?x?xf32> return } @@ -75,7 +75,7 @@ // CHECK: return %[[r]] func.func @tensor_transfer_write_0d(%t: tensor<?x?x?xf32>, %idx: index, %f: f32) -> tensor<?x?x?xf32> { %0 = vector.broadcast %f : f32 to vector<f32> - %1 = vector.transfer_write %0, %t[%idx, %idx, %idx] : vector<f32>, tensor<?x?x?xf32> + %1 = vector.transfer_write %0, %t[%idx, %idx, %idx] {in_bounds = [true, true, true]} : vector<f32>, tensor<?x?x?xf32> return %1 : tensor<?x?x?xf32> } @@ -92,7 +92,7 @@ func.func @transfer_read_2d_extract(%m: memref<?x?x?x?xf32>, %idx: index, %idx2: index) -> f32 { %cst = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index - %0 = vector.transfer_read %m[%idx, %idx, %idx, %idx], %cst {in_bounds = [true, true]} : memref<?x?x?x?xf32>, vector<10x5xf32> + %0 = vector.transfer_read %m[%idx, %idx, %idx, %idx], %cst {in_bounds = [true, true, true, true]} : memref<?x?x?x?xf32>, vector<10x5xf32> %1 = vector.extract %0[8, 1] : vector<10x5xf32> return %1 : f32 } @@ -106,7 +106,7 @@ // CHECK: memref.store %[[extract]], %[[m]][%[[idx]], %[[idx]], %[[idx]]] func.func @transfer_write_arith_constant(%m: memref<?x?x?xf32>, %idx: index) { %cst = arith.constant dense<5.000000e+00> : vector<1x1xf32> - vector.transfer_write %cst, %m[%idx, %idx, %idx] : vector<1x1xf32>, memref<?x?x?xf32> + vector.transfer_write %cst, %m[%idx, %idx, %idx] {in_bounds = [true, true, true]} : vector<1x1xf32>, memref<?x?x?xf32> return } diff --git a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir --- a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir @@ -202,9 +202,9 @@ %c0 = arith.constant 0 : index // CHECK: %[[F0:.+]] = arith.constant 0.000000e+00 : f16 %f0 = arith.constant 0.
: f16 - // CHECK: %[[READ:.+]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]], %[[C0]]], %[[F0]] {in_bounds = [true]} : memref<1x4x8x16xf16>, vector<4xf16> + // CHECK: %[[READ:.+]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]], %[[C0]]], %[[F0]] {in_bounds = [true, true, true, true]} : memref<1x4x8x16xf16>, vector<4xf16> // CHECK: %[[CAST:.+]] = vector.broadcast %[[READ]] : vector<4xf16> to vector<1x4xf16> - %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %f0 {in_bounds = [true, true]} : memref<1x4x8x16xf16>, vector<1x4xf16> + %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %f0 {in_bounds = [true, true, true, true]} : memref<1x4x8x16xf16>, vector<1x4xf16> // CHECK: return %[[CAST]] return %0: vector<1x4xf16> } @@ -214,7 +214,7 @@ %c0 = arith.constant 0 : index %f0 = arith.constant 0. : f16 // CHECK: vector.broadcast %{{.+}} : vector<1xf16> to vector<1x1xf16> - %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %f0 {in_bounds = [true, true]} : memref<1x1x1x1xf16>, vector<1x1xf16> + %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %f0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16>, vector<1x1xf16> return %0: vector<1x1xf16> } @@ -223,9 +223,9 @@ // CHECK: %[[C0:.+]] = arith.constant 0 : index %c0 = arith.constant 0 : index // CHECK: %[[CAST:.+]] = vector.extract %{{.*}}[0] : vector<1x4xf16> - // CHECK: vector.transfer_write %[[CAST]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true]} : vector<4xf16>, memref<1x4x8x16xf16> + // CHECK: vector.transfer_write %[[CAST]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true, true]} : vector<4xf16>, memref<1x4x8x16xf16> - vector.transfer_write %arg1, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf16>, memref<1x4x8x16xf16> + vector.transfer_write %arg1, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x4xf16>, memref<1x4x8x16xf16> return } @@ -233,7 +233,7 @@ func.func @cast_away_transfer_write_leading_one_dims_one_element(%arg0: memref<1x1x1x1xf16>, %arg1: vector<1x1xf16>) { %c0 = arith.constant 0 : index // CHECK: vector.extract %{{.+}}[0] : vector<1x1xf16> - vector.transfer_write %arg1, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x1xf16>, memref<1x1x1x1xf16> + vector.transfer_write %arg1, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1xf16>, memref<1x1x1x1xf16> return } diff --git a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir @@ -3,7 +3,7 @@ func.func @contiguous_inner_most_view(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{ %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32> + %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32> return %0 : vector<1x8x1xf32> } // CHECK: func @contiguous_inner_most_view(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> diff --git a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir 
b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir @@ -4,7 +4,7 @@ %arg : memref<1x1x3x2xi8, strided<[6, 6, 2, 1], offset: ?>>) -> vector<3x2xi8> { %c0 = arith.constant 0 : index %cst = arith.constant 0 : i8 - %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0], %cst : + %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, false, false]} : memref<1x1x3x2xi8, strided<[6, 6, 2, 1], offset: ?>>, vector<3x2xi8> return %v : vector<3x2xi8> } @@ -16,7 +16,7 @@ func.func @transfer_write_rank_reducing(%arg : memref<1x1x3x2xi8, strided<[6, 6, 2, 1], offset: ?>>, %vec : vector<3x2xi8>) { %c0 = arith.constant 0 : index - vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0] : + vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0] {in_bounds = [true, true, false, false]} : vector<3x2xi8>, memref<1x1x3x2xi8, strided<[6, 6, 2, 1], offset: ?>> return } @@ -30,7 +30,7 @@ %arg : memref<1x1x3x2x1xf32>) -> vector<3x2x1xf32> { %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0, %c0], %cst : + %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, false, false, false]} : memref<1x1x3x2x1xf32>, vector<3x2x1xf32> return %v : vector<3x2x1xf32> } @@ -44,7 +44,7 @@ %arg : memref<1x1x3x2x1xf32>, %vec : vector<3x2x1xf32>) { %c0 = arith.constant 0 : index - vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0, %c0] : + vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, false, false, false]} : vector<3x2x1xf32>, memref<1x1x3x2x1xf32> return } @@ -58,7 +58,7 @@ %arg : memref<1x1x1x1x1xf32>) -> vector<1x1x1xf32> { %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0, %c0], %cst : + %v = vector.transfer_read %arg[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, false, false, false]} : memref<1x1x1x1x1xf32>, vector<1x1x1xf32> return %v : vector<1x1x1xf32> } @@ -72,7 +72,7 @@ %arg : memref<1x1x1x1x1xf32>, %vec : vector<1x1x1xf32>) { %c0 = arith.constant 0 : index - vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0, %c0] : + vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, false, false, false]} : vector<1x1x1xf32>, memref<1x1x1x1x1xf32> return } diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -65,7 +65,7 @@ func.func @transfer_read_flattenable_with_dynamic_dims_and_indices(%arg0 : memref>, %arg1 : index, %arg2 : index) -> vector<8x4xi8> { %c0_i8 = arith.constant 0 : i8 %c0 = arith.constant 0 : index - %result = vector.transfer_read %arg0[%arg1, %arg2, %c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref>, vector<8x4xi8> + %result = vector.transfer_read %arg0[%arg1, %arg2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true]} : memref>, vector<8x4xi8> return %result : vector<8x4xi8> } @@ -77,7 +77,7 @@ // CHECK-SAME: : memref into memref // CHECK: %[[VEC1D:.+]] = vector.transfer_read %[[COLLAPSED]] // CHECK-SAME: [%[[ARG1]], %[[ARG2]], %[[C0]]], %[[C0_I8]] -// CHECK-SAME: {in_bounds = [true]} +// CHECK-SAME: {in_bounds = [true, true, true]} // CHECK-SAME: : memref, vector<32xi8> // CHECK: 
%[[VEC2D:.+]] = vector.shape_cast %[[VEC1D]] : vector<32xi8> to vector<8x4xi8> // CHECK: return %[[VEC2D]] : vector<8x4xi8> @@ -86,7 +86,7 @@ func.func @transfer_write_flattenable_with_dynamic_dims_and_indices(%vec : vector<8x4xi8>, %dst : memref>, %arg1 : index, %arg2 : index) { %c0 = arith.constant 0 : index - vector.transfer_write %vec, %dst[%arg1, %arg2, %c0, %c0] {in_bounds = [true, true]} : vector<8x4xi8>, memref> + vector.transfer_write %vec, %dst[%arg1, %arg2, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<8x4xi8>, memref> return } @@ -98,7 +98,7 @@ // CHECK: %[[VEC1D:.+]] = vector.shape_cast %[[ARG0]] : vector<8x4xi8> to vector<32xi8> // CHECK: vector.transfer_write %[[VEC1D]], %[[COLLAPSED]] // CHECK-SAME: [%[[ARG2]], %[[ARG3]], %[[C0]]] -// CHECK-SAME: {in_bounds = [true]} +// CHECK-SAME: {in_bounds = [true, true, true]} // CHECK-SAME: : vector<32xi8>, memref // ----- diff --git a/mlir/test/Dialect/Vector/vector-transfer-materialize-masks.mlir b/mlir/test/Dialect/Vector/vector-transfer-materialize-masks.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Vector/vector-transfer-materialize-masks.mlir @@ -0,0 +1,33 @@ +// RUN: mlir-opt %s -test-transform-dialect-interpreter -split-input-file | FileCheck %s + +transform.sequence failures(propagate) { +^bb1(%func_op: !transform.op<"func.func">): + transform.apply_patterns to %func_op { + transform.apply_patterns.vector.materialize_masks + } : !transform.op<"func.func"> +} + +// CHECK-LABEL: func @mask_1d_transfer( +// CHECK: arith.cmpi +// CHECK: vector.transfer_read %{{.*}}[{{.*}}], %{{.*}}, %{{.*}} {in_bounds = [true, true]} +func.func @mask_1d_transfer(%t: tensor, %i: index, %j: index) + -> vector<5xf32> +{ + %cst = arith.constant 5.5 : f32 + %0 = vector.transfer_read %t[%i, %j], %cst {in_bounds = [true, false]} + : tensor, vector<5xf32> + return %0 : vector<5xf32> +} + +// CHECK-LABEL: func @mask_2d_transfer( +// CHECK-NOT: arith.cmpi +// CHECK: vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {in_bounds = [true, false]} +func.func @mask_2d_transfer(%t: tensor, %i: index, %j: index) + -> vector<5x4xf32> +{ + %cst = arith.constant 5.5 : f32 + // Masks are currently not materialized for transfers that are 2D or higher. 
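+ // (For comparison, the 1-D case above is rewritten to use an explicit mask;
+ // roughly, as a sketch with assumed names rather than the exact generated IR:
+ //   %d1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+ //   %rem = arith.subi %d1, %j : index
+ //   %mask = vector.create_mask %rem : vector<5xi1>
+ // after which the transfer_read is emitted in-bounds with that mask.)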
+ %0 = vector.transfer_read %t[%i, %j], %cst {in_bounds = [true, false]} + : tensor, vector<5x4xf32> + return %0 : vector<5x4xf32> +} diff --git a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir @@ -12,7 +12,7 @@ %vf0 = vector.splat %fn1 : vector<7xf32> %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1]> : vector<7xi1> vector.transfer_write %vf0, %A[%base1, %base2], %mask - {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [false]} + {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [false, true]} : vector<7xf32>, memref return } diff --git a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir @@ -46,8 +46,8 @@ func.func @transfer_to_load(%mem : memref<8x8xf32>, %i : index) -> vector<4xf32> { %cf0 = arith.constant 0.0 : f32 - %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true]} : memref<8x8xf32>, vector<4xf32> - vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true]} : vector<4xf32>, memref<8x8xf32> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true]} : memref<8x8xf32>, vector<4xf32> + vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true, true]} : vector<4xf32>, memref<8x8xf32> return %res : vector<4xf32> } @@ -78,8 +78,8 @@ func.func @transfer_vector_element(%mem : memref<8x8xvector<2x4xf32>>, %i : index) -> vector<2x4xf32> { %cf0 = arith.constant dense<0.0> : vector<2x4xf32> - %res = vector.transfer_read %mem[%i, %i], %cf0 : memref<8x8xvector<2x4xf32>>, vector<2x4xf32> - vector.transfer_write %res, %mem[%i, %i] : vector<2x4xf32>, memref<8x8xvector<2x4xf32>> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true]}: memref<8x8xvector<2x4xf32>>, vector<2x4xf32> + vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true, true]} : vector<2x4xf32>, memref<8x8xvector<2x4xf32>> return %res : vector<2x4xf32> } @@ -89,15 +89,15 @@ // CHECK-SAME: %[[MEM:.*]]: memref<8x8xvector<2x4xf32>>, // CHECK-SAME: %[[IDX:.*]]: index) -> vector<1x2x4xf32> { // CHECK-NEXT: %[[CF0:.*]] = arith.constant dense<0.000000e+00> : vector<2x4xf32> -// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] {in_bounds = [true]} : memref<8x8xvector<2x4xf32>>, vector<1x2x4xf32> -// CHECK-NEXT: vector.transfer_write %[[RES:.*]], %[[MEM]][%[[IDX]], %[[IDX]]] {in_bounds = [true]} : vector<1x2x4xf32>, memref<8x8xvector<2x4xf32>> +// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] {in_bounds = [true, true]} : memref<8x8xvector<2x4xf32>>, vector<1x2x4xf32> +// CHECK-NEXT: vector.transfer_write %[[RES:.*]], %[[MEM]][%[[IDX]], %[[IDX]]] {in_bounds = [true, true]} : vector<1x2x4xf32>, memref<8x8xvector<2x4xf32>> // CHECK-NEXT: return %[[RES]] : vector<1x2x4xf32> // CHECK-NEXT: } func.func @transfer_vector_element_different_types(%mem : memref<8x8xvector<2x4xf32>>, %i : index) -> vector<1x2x4xf32> { %cf0 = arith.constant dense<0.0> : vector<2x4xf32> - %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true]} : memref<8x8xvector<2x4xf32>>, vector<1x2x4xf32> - vector.transfer_write %res, %mem[%i, %i] {in_bounds 
= [true]} : vector<1x2x4xf32>, memref<8x8xvector<2x4xf32>> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true]} : memref<8x8xvector<2x4xf32>>, vector<1x2x4xf32> + vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true, true]} : vector<1x2x4xf32>, memref<8x8xvector<2x4xf32>> return %res : vector<1x2x4xf32> } @@ -125,15 +125,15 @@ // CHECK-SAME: %[[MEM:.*]]: memref<8x8xf32>, // CHECK-SAME: %[[IDX:.*]]: index) -> vector<4xf32> { // CHECK-NEXT: %[[CF0:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] : memref<8x8xf32>, vector<4xf32> -// CHECK-NEXT: vector.transfer_write %[[RES]], %[[MEM]][%[[IDX]], %[[IDX]]] : vector<4xf32>, memref<8x8xf32> +// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] {in_bounds = [true, false]} : memref<8x8xf32>, vector<4xf32> +// CHECK-NEXT: vector.transfer_write %[[RES]], %[[MEM]][%[[IDX]], %[[IDX]]] {in_bounds = [true, false]} : vector<4xf32>, memref<8x8xf32> // CHECK-NEXT: return %[[RES]] : vector<4xf32> // CHECK-NEXT: } func.func @transfer_not_inbounds(%mem : memref<8x8xf32>, %i : index) -> vector<4xf32> { %cf0 = arith.constant 0.0 : f32 - %res = vector.transfer_read %mem[%i, %i], %cf0 : memref<8x8xf32>, vector<4xf32> - vector.transfer_write %res, %mem[%i, %i] : vector<4xf32>, memref<8x8xf32> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, false]} : memref<8x8xf32>, vector<4xf32> + vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true, false]} : vector<4xf32>, memref<8x8xf32> return %res : vector<4xf32> } @@ -148,8 +148,8 @@ #layout = affine_map<(d0, d1) -> (d0*16 + d1)> func.func @transfer_nondefault_layout(%mem : memref<8x8xf32, #layout>, %i : index) -> vector<4xf32> { %cf0 = arith.constant 0.0 : f32 - %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true]} : memref<8x8xf32, #layout>, vector<4xf32> - vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true]} : vector<4xf32>, memref<8x8xf32, #layout> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true]} : memref<8x8xf32, #layout>, vector<4xf32> + vector.transfer_write %res, %mem[%i, %i] {in_bounds = [true, true]} : vector<4xf32>, memref<8x8xf32, #layout> return %res : vector<4xf32> } @@ -159,13 +159,13 @@ // CHECK-SAME: %[[MEM:.*]]: memref<8x8xf32>, // CHECK-SAME: %[[IDX:.*]]: index) -> vector<4xf32> { // CHECK-NEXT: %[[CF0:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] {in_bounds = [true], permutation_map = #{{.*}}} : memref<8x8xf32>, vector<4xf32> +// CHECK-NEXT: %[[RES:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]]], %[[CF0]] {in_bounds = [true, true], permutation_map = #{{.*}}} : memref<8x8xf32>, vector<4xf32> // CHECK-NEXT: return %[[RES]] : vector<4xf32> // CHECK-NEXT: } func.func @transfer_perm_map(%mem : memref<8x8xf32>, %i : index) -> vector<4xf32> { %cf0 = arith.constant 0.0 : f32 - %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<8x8xf32>, vector<4xf32> + %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<8x8xf32>, vector<4xf32> return %res : vector<4xf32> } @@ -183,7 +183,7 @@ func.func @transfer_broadcasting(%mem : memref<8x8xf32>, %i : index) -> vector<4xf32> { %cf0 = arith.constant 0.0 : f32 %res = vector.transfer_read %mem[%i, %i], %cf0 - 
-      {in_bounds = [true], permutation_map = #broadcast_1d}
+      {in_bounds = [true, true], permutation_map = #broadcast_1d}
       : memref<8x8xf32>, vector<4xf32>
   return %res : vector<4xf32>
 }
@@ -197,7 +197,7 @@
 // CHECK-NEXT: }
 func.func @transfer_scalar(%mem : memref<?x?xf32>, %i : index) -> vector<1xf32> {
   %cf0 = arith.constant 0.0 : f32
-  %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true]} : memref<?x?xf32>, vector<1xf32>
+  %res = vector.transfer_read %mem[%i, %i], %cf0 {in_bounds = [true, true]} : memref<?x?xf32>, vector<1xf32>
   return %res : vector<1xf32>
 }
@@ -232,7 +232,7 @@
 func.func @transfer_broadcasting_complex(%mem : memref<10x20x30x8x8xf32>, %i : index) -> vector<3x2x4x5xf32> {
   %cf0 = arith.constant 0.0 : f32
   %res = vector.transfer_read %mem[%i, %i, %i, %i, %i], %cf0
-      {in_bounds = [true, true, true, true], permutation_map = #broadcast_2d_in_4d}
+      {in_bounds = [true, true, true, true, true], permutation_map = #broadcast_2d_in_4d}
       : memref<10x20x30x8x8xf32>, vector<3x2x4x5xf32>
   return %res : vector<3x2x4x5xf32>
 }
@@ -271,19 +271,19 @@
 // CHECK: %[[MASK0:.*]] = vector.splat %{{.*}} : vector<14x7xi1>
   %mask0 = vector.splat %m : vector<14x7xi1>
   %0 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst, %mask0 {in_bounds = [true, false, true, true], permutation_map = #map0} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
-// CHECK: vector.transfer_read {{.*}} %[[MASK0]] {in_bounds = [false, true, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<14x7x8x16xf32>
+// CHECK: vector.transfer_read {{.*}} %[[MASK0]] {in_bounds = [true, false, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<14x7x8x16xf32>
 // CHECK: vector.transpose %{{.*}}, [1, 0, 2, 3] : vector<14x7x8x16xf32> to vector<7x14x8x16xf32>

 // CHECK: %[[MASK1:.*]] = vector.splat %{{.*}} : vector<16x14xi1>
   %mask1 = vector.splat %m : vector<16x14xi1>
-  %1 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst, %mask1 {permutation_map = #map1} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
-// CHECK: vector.transfer_read {{.*}} %[[MASK1]] {permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<16x14x7x8xf32>
+  %1 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst, %mask1 {permutation_map = #map1, in_bounds = [false, false, true, true]} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
+// CHECK: vector.transfer_read {{.*}} %[[MASK1]] {in_bounds = [false, false, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<16x14x7x8xf32>
 // CHECK: vector.transpose %{{.*}}, [2, 1, 3, 0] : vector<16x14x7x8xf32> to vector<7x14x8x16xf32>

 // CHECK: %[[MASK3:.*]] = vector.splat %{{.*}} : vector<14x7xi1>
   %mask2 = vector.splat %m : vector<14x7xi1>
   %2 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst, %mask2 {in_bounds = [true, false, true, true], permutation_map = #map2} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
-// CHECK: vector.transfer_read {{.*}} %[[MASK3]] {in_bounds = [false, true, true], permutation_map = #[[$MAP1]]} : memref<?x?x?x?xf32>, vector<14x16x7xf32>
+// CHECK: vector.transfer_read {{.*}} %[[MASK3]] {in_bounds = [true, false, true, true], permutation_map = #[[$MAP1]]} : memref<?x?x?x?xf32>, vector<14x16x7xf32>
 // CHECK: vector.broadcast %{{.*}} : vector<14x16x7xf32> to vector<8x14x16x7xf32>
 // CHECK: vector.transpose %{{.*}}, [3, 1, 0, 2] : vector<8x14x16x7xf32> to vector<7x14x8x16xf32>
@@ -301,7 +301,7 @@
 // CHECK: vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]], %[[C0]]], %[[CF0]] : memref<?x?x?x?xf32>, vector<16x14x7x8xf32>
 // CHECK: vector.transpose %{{.*}}, [2, 1, 3, 0] : vector<16x14x7x8xf32> to vector<7x14x8x16xf32>

-  %6 = vector.transfer_read %arg0[%c0, %c0], %cst {permutation_map = #map6} : memref<?x?xf32>, vector<8xf32>
+  %6 = vector.transfer_read %arg0[%c0, %c0], %cst {permutation_map = #map6, in_bounds = [true, true]} : memref<?x?xf32>, vector<8xf32>
 // CHECK: memref.load %{{.*}}[%[[C0]], %[[C0]]] : memref<?x?xf32>
 // CHECK: vector.broadcast %{{.*}} : f32 to vector<8xf32>
@@ -326,11 +326,11 @@
   %mask0 = vector.splat %m : vector<16x14x7x8xi1>
   %0 = vector.transfer_write %v1, %arg1[%c0, %c0, %c0, %c0], %mask0 {in_bounds = [true, false, false, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d1, d3, d0)>} : vector<7x14x8x16xf32>, tensor<?x?x?x?xf32>
   // CHECK: %[[NEW_VEC0:.*]] = vector.transpose %{{.*}} [3, 1, 0, 2] : vector<7x14x8x16xf32> to vector<16x14x7x8xf32>
-  // CHECK: %[[NEW_RES0:.*]] = vector.transfer_write %[[NEW_VEC0]], %[[ARG1]][%c0, %c0, %c0, %c0], %[[MASK]] {in_bounds = [true, false, true, false]} : vector<16x14x7x8xf32>, tensor<?x?x?x?xf32>
+  // CHECK: %[[NEW_RES0:.*]] = vector.transfer_write %[[NEW_VEC0]], %[[ARG1]][%c0, %c0, %c0, %c0], %[[MASK]] {in_bounds = [true, false, false, true]} : vector<16x14x7x8xf32>, tensor<?x?x?x?xf32>

-  vector.transfer_write %v2, %arg0[%c0, %c0, %c0, %c0] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} : vector<8x16xf32>, memref<?x?x?x?xf32>
+  vector.transfer_write %v2, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true, false, false], permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} : vector<8x16xf32>, memref<?x?x?x?xf32>
   // CHECK: %[[NEW_VEC1:.*]] = vector.transpose %{{.*}} [1, 0] : vector<8x16xf32> to vector<16x8xf32>
-  // CHECK: vector.transfer_write %[[NEW_VEC1]], %[[ARG0]][%c0, %c0, %c0, %c0] : vector<16x8xf32>, memref<?x?x?x?xf32>
+  // CHECK: vector.transfer_write %[[NEW_VEC1]], %[[ARG0]][%c0, %c0, %c0, %c0] {in_bounds = [true, true, false, false]} : vector<16x8xf32>, memref<?x?x?x?xf32>

   return %0 : tensor<?x?x?x?xf32>
 }
@@ -347,15 +347,15 @@
   // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   %c0 = arith.constant 0 : index

-  %0 = vector.transfer_write %v1, %arg1[%c0, %c0, %c0, %c0] {in_bounds = [false, false, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>} : vector<14x8x16xf32>, tensor<?x?x?x?xf32>
+  %0 = vector.transfer_write %v1, %arg1[%c0, %c0, %c0, %c0] {in_bounds = [false, false, true, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>} : vector<14x8x16xf32>, tensor<?x?x?x?xf32>
   // CHECK: %[[NEW_VEC0:.*]] = vector.broadcast %{{.*}} : vector<14x8x16xf32> to vector<1x14x8x16xf32>
   // CHECK: %[[NEW_VEC1:.*]] = vector.transpose %[[NEW_VEC0]], [1, 2, 0, 3] : vector<1x14x8x16xf32> to vector<14x8x1x16xf32>
   // CHECK: %[[NEW_RES0:.*]] = vector.transfer_write %[[NEW_VEC1]], %[[ARG1]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [false, false, true, true]} : vector<14x8x1x16xf32>, tensor<?x?x?x?xf32>

-  vector.transfer_write %v2, %arg0[%c0, %c0, %c0, %c0] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d2)>} : vector<8x16xf32>, memref<?x?x?x?xf32>
+  vector.transfer_write %v2, %arg0[%c0, %c0, %c0, %c0] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d2)>, in_bounds = [true, false, false, true]} : vector<8x16xf32>, memref<?x?x?x?xf32>
   // CHECK: %[[NEW_VEC2:.*]] = vector.broadcast %{{.*}} : vector<8x16xf32> to vector<1x8x16xf32>
   // CHECK: %[[NEW_VEC3:.*]] = vector.transpose %[[NEW_VEC2]], [1, 2, 0] : vector<1x8x16xf32> to vector<8x16x1xf32>
-  // CHECK: vector.transfer_write %[[NEW_VEC3]], %[[ARG0]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [false, false, true]} : vector<8x16x1xf32>, memref<?x?x?x?xf32>
+  // CHECK: vector.transfer_write %[[NEW_VEC3]], %[[ARG0]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, false, false, true]} : vector<8x16x1xf32>, memref<?x?x?x?xf32>

   return %0 : tensor<?x?x?x?xf32>
 }
diff --git a/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir b/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir
--- a/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir
@@ -199,7 +199,7 @@
 func.func @transfer_read_unroll_broadcast(%arg0 : memref<6x4xf32>) -> vector<6x4xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {permutation_map = #map0} : memref<6x4xf32>, vector<6x4xf32>
+  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {in_bounds = [true, false], permutation_map = #map0} : memref<6x4xf32>, vector<6x4xf32>
   return %0 : vector<6x4xf32>
 }
@@ -226,7 +226,7 @@
 func.func @transfer_read_unroll_broadcast_permuation(%arg0 : memref<6x4xf32>) -> vector<4x6xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {permutation_map = #map0} : memref<6x4xf32>, vector<4x6xf32>
+  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {in_bounds = [false, true], permutation_map = #map0} : memref<6x4xf32>, vector<4x6xf32>
   return %0 : vector<4x6xf32>
 }
@@ -272,7 +272,7 @@
 func.func @transfer_read_unroll_different_rank(%arg0 : memref<?x?x?xf32>) -> vector<6x4xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0, %c0], %cf0 {permutation_map = #map0} : memref<?x?x?xf32>, vector<6x4xf32>
+  %0 = vector.transfer_read %arg0[%c0, %c0, %c0], %cf0 {in_bounds = [false, true, false], permutation_map = #map0} : memref<?x?x?xf32>, vector<6x4xf32>
   return %0 : vector<6x4xf32>
 }
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -127,7 +127,7 @@
     %v = "test.dummy_op"() : () -> (vector<1xf32>)
     %v1 = "test.dummy_op"() : () -> (vector<1x1xf32>)
     vector.transfer_write %v1, %arg1[%c0, %c0] : vector<1x1xf32>, memref<1024x1024xf32>
-    vector.transfer_write %v, %arg1[%c0, %c0] : vector<1xf32>, memref<1024x1024xf32>
+    vector.transfer_write %v, %arg1[%c0, %c0] {in_bounds = [true, false]}: vector<1xf32>, memref<1024x1024xf32>
   }
   return
 }
@@ -535,7 +535,7 @@
 // CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
 // CHECK-D:   vector.transfer_write %[[R]], %{{.*}}[] : vector<f32>, memref<f32>
   vector.warp_execute_on_lane_0(%laneid)[32] {
-    %0 = vector.transfer_read %m0[%c0, %c0, %c0], %f0 {in_bounds = [true]} : memref<4x2x32xf32>, vector<32xf32>
+    %0 = vector.transfer_read %m0[%c0, %c0, %c0], %f0 {in_bounds = [true, true, true]} : memref<4x2x32xf32>, vector<32xf32>
     %1 = vector.transfer_read %m1[], %f0 : memref<f32>, vector<f32>
     %2 = vector.extractelement %1[] : vector<f32>
     %3 = vector.reduction <add>, %0 : vector<32xf32> into f32
@@ -1147,7 +1147,7 @@
     %29 = vector.extract %28[0] : vector<1x64xi32>
     %30 = arith.index_cast %29 : vector<64xi32> to vector<64xindex>
     %36 = vector.extractelement %30[%c0_i32 : i32] : vector<64xindex>
-    %37 = vector.transfer_read %ar2[%c0, %36, %c0], %cst_6 {in_bounds = [true]} : memref<1x4x1024xf32>, vector<64xf32>
+    %37 = vector.transfer_read %ar2[%c0, %36, %c0], %cst_6 {in_bounds = [true, true, true]} : memref<1x4x1024xf32>, vector<64xf32>
     vector.yield %37 : vector<64xf32>
   }
   return %18 : vector<2xf32>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
@@ -159,11 +159,11 @@
     //
     %f0 = arith.constant 0.0 : f32
     scf.for %i = %c0 to %c8 step %c1 {
-      %v1 = vector.transfer_read %C1[%i, %c0], %f0
+      %v1 = vector.transfer_read %C1[%i, %c0], %f0 {in_bounds = [true, true]}
         : tensor<8x8xf32>, vector<8xf32>
-      %v2 = vector.transfer_read %C2[%i, %c0], %f0
+      %v2 = vector.transfer_read %C2[%i, %c0], %f0 {in_bounds = [true, true]}
        : tensor<8x8xf32>, vector<8xf32>
-      %v3 = vector.transfer_read %C3[%i, %c0], %f0
+      %v3 = vector.transfer_read %C3[%i, %c0], %f0 {in_bounds = [true, true]}
        : tensor<8x8xf32>, vector<8xf32>
      vector.print %v1 : vector<8xf32>
      vector.print %v2 : vector<8xf32>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir
@@ -124,7 +124,7 @@
     // CHECK: ( 0, 520, 0, 0, 1250 )
     //
     scf.for %i = %c0 to %c5 step %c1 {
-      %v = vector.transfer_read %0[%i, %c0], %d0: tensor<?x?xf32>, vector<5xf32>
+      %v = vector.transfer_read %0[%i, %c0], %d0 {in_bounds = [true, true]}: tensor<?x?xf32>, vector<5xf32>
       vector.print %v : vector<5xf32>
     }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
@@ -124,12 +124,12 @@
     //
     %x = sparse_tensor.convert %0 : tensor<4x3xf64, #DCSR> to tensor<4x3xf64>
     scf.for %i = %c0 to %c4 step %c1 {
-      %v1 = vector.transfer_read %x[%i, %c0], %du: tensor<4x3xf64>, vector<3xf64>
+      %v1 = vector.transfer_read %x[%i, %c0], %du {in_bounds = [true, true]} : tensor<4x3xf64>, vector<3xf64>
       vector.print %v1 : vector<3xf64>
     }
     %y = sparse_tensor.convert %1 : tensor<4x3xf64, #DCSR> to tensor<4x3xf64>
     scf.for %i = %c0 to %c4 step %c1 {
-      %v2 = vector.transfer_read %y[%i, %c0], %du: tensor<4x3xf64>, vector<3xf64>
+      %v2 = vector.transfer_read %y[%i, %c0], %du {in_bounds = [true, true]} : tensor<4x3xf64>, vector<3xf64>
       vector.print %v2 : vector<3xf64>
     }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -65,21 +65,21 @@
     %c5 = arith.constant 5 : index
     %c6 = arith.constant 6 : index
     %c7 = arith.constant 7 : index
-    %r0 = vector.transfer_read %mat[%c0,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r0 = vector.transfer_read %mat[%c0,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r0 : vector<8xf32>
-    %r1 = vector.transfer_read %mat[%c1,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r1 = vector.transfer_read %mat[%c1,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r1 : vector<8xf32>
-    %r2 = vector.transfer_read %mat[%c2,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r2 = vector.transfer_read %mat[%c2,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r2 : vector<8xf32>
-    %r3 = vector.transfer_read %mat[%c3,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r3 = vector.transfer_read %mat[%c3,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r3 : vector<8xf32>
-    %r4 = vector.transfer_read %mat[%c4,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r4 = vector.transfer_read %mat[%c4,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r4 : vector<8xf32>
-    %r5 = vector.transfer_read %mat[%c5,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r5 = vector.transfer_read %mat[%c5,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r5 : vector<8xf32>
-    %r6 = vector.transfer_read %mat[%c6,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r6 = vector.transfer_read %mat[%c6,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r6 : vector<8xf32>
-    %r7 = vector.transfer_read %mat[%c7,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    %r7 = vector.transfer_read %mat[%c7,%c0], %f0 {in_bounds = [true, false]} : tensor<8x8xf32>, vector<8xf32>
     vector.print %r7 : vector<8xf32>
     return
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -130,18 +130,18 @@
     %quad_col = affine.apply affine_map<()[s0]->(s0 * 2)>()[%col_8x4] // account for 2xf16/col

     // Load quad (0, 0)
-    %A_quad00 = vector.transfer_read %argA[%quad_row, %quad_col], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+    %A_quad00 = vector.transfer_read %argA[%quad_row, %quad_col], %f0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<2xf16>

     // Load quad (1, 0). Just shift row down 8.
     %quad_row_plus_8 = affine.apply affine_map<(d0)[]->(d0+8)>(%quad_row)[]
-    %A_quad10 = vector.transfer_read %argA[%quad_row_plus_8, %quad_col], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+    %A_quad10 = vector.transfer_read %argA[%quad_row_plus_8, %quad_col], %f0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<2xf16>

     // Load quad (0, 1). Just shift col right 8 (4 2xf16 values)
     %quad_col_plus_8 = affine.apply affine_map<(d0)[]->(d0+8)>(%quad_col)[]
-    %A_quad01 = vector.transfer_read %argA[%quad_row, %quad_col_plus_8], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+    %A_quad01 = vector.transfer_read %argA[%quad_row, %quad_col_plus_8], %f0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<2xf16>

     // Load quad (1, 1)
-    %A_quad11 = vector.transfer_read %argA[%quad_row_plus_8, %quad_col_plus_8], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+    %A_quad11 = vector.transfer_read %argA[%quad_row_plus_8, %quad_col_plus_8], %f0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<2xf16>

     // Assemble the elements into a vector
     %A_init0 = arith.constant dense<0.0> : vector<4x2xf16>
@@ -166,18 +166,18 @@
     // (t) -> (t/4, t % 4). So we can re-use some of the calculation from A.
     // Load quad (0, 0)
-    %B_quad0 = vector.transfer_read %argB[%quad_row, %quad_col], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+    %B_quad0 = vector.transfer_read %argB[%quad_row, %quad_col], %f0 {in_bounds = [true, true]} : memref<8x32xf16>, vector<2xf16>

     // Load quad (0, 1)
-    %B_quad1 = vector.transfer_read %argB[%quad_row, %quad_col_plus_8], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+    %B_quad1 = vector.transfer_read %argB[%quad_row, %quad_col_plus_8], %f0 {in_bounds = [true, true]} : memref<8x32xf16>, vector<2xf16>

     // Load quad (0, 2)
     %quad_col_plus_16 = affine.apply affine_map<()[s0]->(s0 + 16)>()[%quad_col]
-    %B_quad2 = vector.transfer_read %argB[%quad_row, %quad_col_plus_16], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+    %B_quad2 = vector.transfer_read %argB[%quad_row, %quad_col_plus_16], %f0 {in_bounds = [true, true]} : memref<8x32xf16>, vector<2xf16>

     // Load quad (0, 3)
     %quad_col_plus_24 = affine.apply affine_map<()[s0]->(s0 + 24)>()[%quad_col]
-    %B_quad3 = vector.transfer_read %argB[%quad_row, %quad_col_plus_24], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+    %B_quad3 = vector.transfer_read %argB[%quad_row, %quad_col_plus_24], %f0 {in_bounds = [true, true]} : memref<8x32xf16>, vector<2xf16>

     // Assemble into vector
     %B_init0 = arith.constant dense<0.0> : vector<4x2xf16>
@@ -208,8 +208,8 @@
     // vector1: (tid) -> (tid / 4 + 8, tid %4)
     %C_0 = vector.extract %d[0] : vector<2x2xf16>
     %C_1 = vector.extract %d[1] : vector<2x2xf16>
-    vector.transfer_write %C_0, %argC[%quad_row, %quad_col] {in_bounds = [true]} : vector<2xf16>, memref<16x8xf16>
-    vector.transfer_write %C_1, %argC[%quad_row_plus_8, %quad_col] {in_bounds = [true]} : vector<2xf16>, memref<16x8xf16>
+    vector.transfer_write %C_0, %argC[%quad_row, %quad_col] {in_bounds = [true, true]} : vector<2xf16>, memref<16x8xf16>
+    vector.transfer_write %C_1, %argC[%quad_row_plus_8, %quad_col] {in_bounds = [true, true]} : vector<2xf16>, memref<16x8xf16>

     gpu.return
   }
@@ -319,7 +319,7 @@
     // CHECK-NEXT: ( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 )
     //
     scf.for %pai = %c0 to %c16 step %c1 {
-      %pa0 = vector.transfer_read %a[%pai, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
+      %pa0 = vector.transfer_read %a[%pai, %c0], %f0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16xf16>
       vector.print %pa0 : vector<16xf16>
     }
@@ -338,7 +338,7 @@
     //
     //
     scf.for %pbi = %c0 to %c8 step %c1 {
-      %pb0 = vector.transfer_read %b[%pbi, %c0], %f0 : memref<8x32xf16>, vector<32xf16>
+      %pb0 = vector.transfer_read %b[%pbi, %c0], %f0 {in_bounds = [true, true]} : memref<8x32xf16>, vector<32xf16>
       vector.print %pb0 : vector<32xf16>
     }
@@ -395,7 +395,7 @@
     // CHECK-NEXT: ( -6320, -5944, -5568, -5192, -4816, -4440, -4064, -3688 )
     //
     scf.for %pci = %c0 to %c16 step %c1 {
-      %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x8xf16>, vector<8xf16>
+      %pc0 = vector.transfer_read %c[%pci, %c0], %f0 {in_bounds = [true, true]} : memref<16x8xf16>, vector<8xf16>
       vector.print %pc0 : vector<8xf16>
     }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir
@@ -21,7 +21,7 @@
 func.func @transfer_read_1d(%A : memref<?x?xf32>, %base1 : index, %base2 : index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%base1, %base2], %fm42
-      {permutation_map = affine_map<(d0, d1) -> (d0)>}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds=[false, true]}
       : memref<?x?xf32>, vector<9xf32>
   vector.print %f: vector<9xf32>
   return
@@ -41,7 +41,7 @@
   scf.for %arg3 = %c0 to %c6 step %c3 {
     %0 = memref.subview %A[%arg2, %arg3] [1, 2] [1, 1]
        : memref<?x?xf32> to memref<1x2xf32, strided<[?, 1], offset: ?>>
-    %1 = vector.transfer_read %0[%c0, %c0], %fm42 {in_bounds=[true]}
+    %1 = vector.transfer_read %0[%c0, %c0], %fm42 {in_bounds=[true, true]}
        : memref<1x2xf32, strided<[?, 1], offset: ?>>, vector<2xf32>
     vector.print %1 : vector<2xf32>
   }
@@ -59,7 +59,7 @@
   %fm42 = arith.constant -42.0: f32
   %1 = memref.reinterpret_cast %A to offset: [%c6], sizes: [%c4, %c6], strides: [%c6, %c1]
      : memref<?x?xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>>
-  %2 = vector.transfer_read %1[%c2, %c1], %fm42 {in_bounds=[true]}
+  %2 = vector.transfer_read %1[%c2, %c1], %fm42 {in_bounds=[true, true]}
      : memref<?x?xf32, strided<[?, ?], offset: ?>>, vector<4xf32>
   vector.print %2 : vector<4xf32>
   return
@@ -72,7 +72,7 @@
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %fm42 = arith.constant -42.0: f32
-  %vec = vector.transfer_read %B[%c2, %c1], %fm42 {in_bounds=[false]} : memref<4x3xf32, strided<[6, 2]>>, vector<3xf32>
+  %vec = vector.transfer_read %B[%c2, %c1], %fm42 {in_bounds=[true, false]} : memref<4x3xf32, strided<[6, 2]>>, vector<3xf32>
   vector.print %vec : vector<3xf32>
   return
 }
@@ -82,7 +82,7 @@
     %A : memref<?x?xf32>, %base1 : index, %base2 : index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%base1, %base2], %fm42
-      {permutation_map = affine_map<(d0, d1) -> (0)>}
+      {permutation_map = affine_map<(d0, d1) -> (0)>, in_bounds=[true, true]}
      : memref<?x?xf32>, vector<9xf32>
   vector.print %f: vector<9xf32>
   return
@@ -93,7 +93,7 @@
     %A : memref<?x?xf32>, %base1 : index, %base2 : index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%base1, %base2], %fm42
-      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true, true]}
      : memref<?x?xf32>, vector<3xf32>
   vector.print %f: vector<3xf32>
   return
@@ -105,7 +105,7 @@
   %fm42 = arith.constant -42.0: f32
   %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1, 0, 1]> : vector<9xi1>
   %f = vector.transfer_read %A[%base1, %base2], %fm42, %mask
-      {permutation_map = affine_map<(d0, d1) -> (d0)>}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds=[false, true]}
      : memref<?x?xf32>, vector<9xf32>
   vector.print %f: vector<9xf32>
   return
@@ -117,7 +117,7 @@
   %fm42 = arith.constant -42.0: f32
   %mask = arith.constant dense<[1, 0, 1]> : vector<3xi1>
   %f = vector.transfer_read %A[%base1, %base2], %fm42, %mask
-      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true, true]}
      : memref<?x?xf32>, vector<3xf32>
   vector.print %f: vector<3xf32>
   return
@@ -128,7 +128,7 @@
   %fn1 = arith.constant -1.0 : f32
   %vf0 = vector.splat %fn1 : vector<7xf32>
   vector.transfer_write %vf0, %A[%base1, %base2]
-      {permutation_map = affine_map<(d0, d1) -> (d0)>}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds=[false, true]}
      : vector<7xf32>, memref<?x?xf32>
   return
 }
@@ -139,7 +139,7 @@
   %vf0 = vector.splat %fn1 : vector<7xf32>
   %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1]> : vector<7xi1>
   vector.transfer_write %vf0, %A[%base1, %base2], %mask
-      {permutation_map = affine_map<(d0, d1) -> (d0)>}
+      {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds=[false, true]}
      : vector<7xf32>, memref<?x?xf32>
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
@@ -57,7 +57,8 @@
   %fm42 = arith.constant -42.0: f32
   %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1, 0, 1]> : vector<9xi1>
   %f = vector.transfer_read %A[%base1, %base2], %fm42, %mask
-      {permutation_map = affine_map<(d0, d1) -> (0, d1)>} :
+      {permutation_map = affine_map<(d0, d1) -> (0, d1)>,
+       in_bounds = [true, false]} :
     memref<?x?xf32>, vector<4x9xf32>
   vector.print %f: vector<4x9xf32>
   return
@@ -69,7 +70,8 @@
   %fm42 = arith.constant -42.0: f32
   %mask = arith.constant dense<[1, 0, 1, 1]> : vector<4xi1>
   %f = vector.transfer_read %A[%base1, %base2], %fm42, %mask
-      {permutation_map = affine_map<(d0, d1) -> (d1, 0)>} :
+      {permutation_map = affine_map<(d0, d1) -> (d1, 0)>,
+       in_bounds = [true, false]} :
     memref<?x?xf32>, vector<4x9xf32>
   vector.print %f: vector<4x9xf32>
   return
@@ -91,7 +93,8 @@
   %A : memref<?x?xf32>, %base1: index, %base2: index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%base1, %base2], %fm42
-      {permutation_map = affine_map<(d0, d1) -> (d1, 0)>} :
+      {permutation_map = affine_map<(d0, d1) -> (d1, 0)>,
+       in_bounds = [true, false]} :
     memref<?x?xf32>, vector<4x9xf32>
   vector.print %f: vector<4x9xf32>
   return
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir
@@ -9,30 +9,32 @@
 // RUN: FileCheck %s

 func.func @transfer_read_3d(%A : memref<?x?x?x?xf32>,
-    %o: index, %a: index, %b: index, %c: index) {
+                            %o: index, %a: index, %b: index, %c: index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42
+      {in_bounds = [true, false, false, false]}
       : memref<?x?x?x?xf32>, vector<2x5x3xf32>
   vector.print %f: vector<2x5x3xf32>
   return
 }

-func.func @transfer_read_3d_and_extract(%A : memref<?x?x?x?xf32>,
-    %o: index, %a: index, %b: index, %c: index) {
+func.func @transfer_read_3d_and_extract(%A : memref<?x?x?x?xf32>, %o: index,
+                                        %a: index, %b: index, %c: index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42
-      {in_bounds = [true, true, true]}
+      {in_bounds = [true, true, true, true]}
       : memref<?x?x?x?xf32>, vector<2x5x3xf32>
   %sub = vector.extract %f[0] : vector<2x5x3xf32>
   vector.print %sub: vector<5x3xf32>
   return
 }

-func.func @transfer_read_3d_broadcast(%A : memref<?x?x?x?xf32>,
-    %o: index, %a: index, %b: index, %c: index) {
+func.func @transfer_read_3d_broadcast(%A : memref<?x?x?x?xf32>, %o: index,
+                                      %a: index, %b: index, %c: index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42
-      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, d3)>}
+      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, d3)>,
+       in_bounds = [true, false, true, false]}
       : memref<?x?x?x?xf32>, vector<2x5x3xf32>
   vector.print %f: vector<2x5x3xf32>
   return
@@ -43,27 +45,30 @@
   %fm42 = arith.constant -42.0: f32
   %mask = arith.constant dense<[0, 1]> : vector<2xi1>
   %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42, %mask
-      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, 0)>}
+      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, 0)>,
+       in_bounds = [true, false, true, true]}
       : memref<?x?x?x?xf32>, vector<2x5x3xf32>
   vector.print %f: vector<2x5x3xf32>
   return
 }

-func.func @transfer_read_3d_transposed(%A : memref<?x?x?x?xf32>,
-    %o: index, %a: index, %b: index, %c: index) {
+func.func @transfer_read_3d_transposed(%A : memref<?x?x?x?xf32>, %o: index,
+                                       %a: index, %b: index, %c: index) {
   %fm42 = arith.constant -42.0: f32
   %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42
-      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1)>}
+      {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1)>,
+       in_bounds = [false, false, true, false]}
       : memref<?x?x?x?xf32>, vector<3x5x3xf32>
   vector.print %f: vector<3x5x3xf32>
   return
 }

-func.func @transfer_write_3d(%A : memref<?x?x?x?xf32>,
-    %o: index, %a: index, %b: index, %c: index) {
+func.func @transfer_write_3d(%A : memref<?x?x?x?xf32>, %o: index, %a: index,
+                             %b: index, %c: index) {
   %fn1 = arith.constant -1.0 : f32
   %vf0 = vector.splat %fn1 : vector<2x9x3xf32>
   vector.transfer_write %vf0, %A[%o, %a, %b, %c]
+      {in_bounds = [true, false, false, false]}
       : vector<2x9x3xf32>, memref<?x?x?x?xf32>
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
@@ -101,7 +101,7 @@
   // CHECK-SAME: ( -42, -42, -42, -42, -42 ),
   // CHECK-SAME: ( -42, -42, -42, -42, -42 ) )

-  %5 = vector.transfer_read %0[%c2, %c3], %cst {permutation_map = #map1} : memref<?x?xf32>, vector<5xf32>
+  %5 = vector.transfer_read %0[%c2, %c3], %cst {permutation_map = #map1, in_bounds = [true, false]} : memref<?x?xf32>, vector<5xf32>
   vector.print %5 : vector<5xf32>

   // CHECK-NEXT: ( 403, 503, 502, -42, -42 )
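
For reference, a minimal standalone sketch of the convention the test updates
above follow (function and value names are hypothetical, not taken from the
patch): `in_bounds` now carries one boolean per tensor/memref dimension rather
than one per vector dimension, so a 1-D read from a rank-2 memref takes two
entries.

func.func @in_bounds_per_source_dim(%mem : memref<4x8xf32>,
                                    %i : index, %j : index)
    -> (vector<8xf32>, vector<8xf32>) {
  %c0 = arith.constant 0 : index
  %pad = arith.constant 0.0 : f32
  // Reading a full row starting at column 0: both source dimensions are
  // guaranteed in-bounds, so this transfer may lower to a simple load.
  %v0 = vector.transfer_read %mem[%i, %c0], %pad {in_bounds = [true, true]}
      : memref<4x8xf32>, vector<8xf32>
  // Starting at a dynamic column, dim 1 may run out-of-bounds as indices
  // increase, so it is conservatively marked false; out-of-bounds lanes
  // take the padding value %pad. Dim 0, which the vector does not sweep,
  // stays in-bounds because the starting point must always be in-bounds.
  %v1 = vector.transfer_read %mem[%i, %j], %pad {in_bounds = [true, false]}
      : memref<4x8xf32>, vector<8xf32>
  return %v0, %v1 : vector<8xf32>, vector<8xf32>
}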