diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -636,19 +636,21 @@
   Gather
 };
 
-/// Check whether /p val can be used for calculating an index for a contiguous
-/// load operation. This means that /p val should either:
-/// * be invariant with respect to /p linalgOp, or
-/// * increment by 1 with every loop iterator (when /p shouldBeConstant is
-/// false).
-/// Parameters /p trailingLoopDim and /p shouldBeConstant are used to analyze
-/// `linalg.index` ops.
-static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val,
-                                size_t trailingLoopDim, bool shouldBeConstant) {
-  auto *block = linalgOp.getBlock();
+/// Checks whether \p val can be used for calculating a loop invariant index.
+static bool isLoopInvariantIdx(LinalgOp &linalgOp, Value &val) {
 
-  // Bail out if this is a block argument for this linalg.generic Op.
+  auto targetShape = linalgOp.getStaticLoopRanges();
+  assert(((llvm::count_if(targetShape,
+                          [](int64_t dimSize) { return dimSize > 1; }) == 1)) &&
+         "n-D vectors are not yet supported");
+  assert(targetShape.back() != 1 &&
+         "1-D vectors with the trailing dim equal 1 are not yet supported");
+
+  // Blocks outside _this_ linalg.generic are effectively loop invariant.
+  // However, analysing block arguments for _this_ linalg.generic Op is a bit
+  // tricky. Just bail out in the latter case.
   // TODO: We could try analysing the corresponding affine map here.
+  auto *block = linalgOp.getBlock();
   if (val.dyn_cast<BlockArgument>())
     return llvm::all_of(block->getArguments(),
                         [&val](Value v) { return (v != val); });
@@ -656,61 +658,90 @@
   Operation *defOp = val.getDefiningOp();
   assert(defOp && "This is neither a block argument nor an operation result");
 
-  // We know that we are reading into a 1-D tensor like this:
-  // `tensor<1x1x4xi32`. Given this assumption, the following Op:
-  // * `%idx = `linalg.index dim : index`,
-  // will either:
-  // 1. produce a constant when `dim` _is not_ the trailing loop dim, or
-  // 2. increment with stride one when `dim` _is_ the trailing loop dim.
+  // IndexOp is loop invariant as long as its result remains constant across
+  // iterations. Given the assumptions on the loop ranges above, only the
+  // trailing loop dim ever changes.
+  auto trailingLoopDim = linalgOp.getStaticLoopRanges().size() - 1;
   if (auto indexOp = dyn_cast<linalg::IndexOp>(defOp))
-    return shouldBeConstant ? (indexOp.getDim() != trailingLoopDim)
-                            : (indexOp.getDim() == trailingLoopDim);
+    return (indexOp.getDim() != trailingLoopDim);
 
   auto *ancestor = block->findAncestorOpInBlock(*defOp);
 
-  // Values define outside `linalgOp`.
+  // Values defined outside `linalgOp` are loop invariant.
   if (!ancestor)
     return true;
 
-  // Values defined inside `linalgOp`, which are constant.
+  // Values defined inside `linalgOp`, which are constant, are loop invariant.
   if (dyn_cast<arith::ConstantOp>(ancestor))
    return true;
 
-  // Conservatively reject Ops that could lead to non-contiguous accesses.
-  if (!isa(ancestor))
-    return false;
-
   bool result = true;
   for (auto op : ancestor->getOperands())
-    result &=
-        isContiguousLoadIdx(linalgOp, op, trailingLoopDim, shouldBeConstant);
+    result &= isLoopInvariantIdx(linalgOp, op);
 
   return result;
 }
 
-/// Check whether the calculation of \p val is based on linalg.index Op with
-/// the dim attribute matching \p dim.
-static bool isBasedOnIndexOp(LinalgOp &linalgOp, Value &val, size_t dim) {
-  auto *block = linalgOp.getBlock();
-  auto targetShape = linalgOp.getStaticLoopRanges();
+/// Check whether \p val could be used for calculating the trailing index for a
+/// contiguous load operation.
+///
+/// There are currently 3 types of values that are allowed here:
+/// 1. loop-invariant values,
+/// 2. values that increment by 1 with every loop iteration,
+/// 3. results of basic arithmetic operations (linear and continuous)
+///    involving 1., 2. and 3.
+/// This method returns True if indeed only such values are used in calculating
+/// \p val.
+///
+/// Additionally, the trailing index for a contiguous load operation should
+/// increment by 1 with every loop iteration, i.e. be based on:
+/// * `linalg.index <dim>`,
+/// where <dim> is the trailing dim of the iteration space. \p foundIndexOp is
+/// updated to `true` when such an op is found.
+static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val,
+                                bool &foundIndexOp) {
 
-  if (val.isa<BlockArgument>())
-    return false;
+  auto targetShape = linalgOp.getStaticLoopRanges();
+  assert(((llvm::count_if(targetShape,
+                          [](int64_t dimSize) { return dimSize > 1; }) == 1)) &&
+         "n-D vectors are not yet supported");
+  assert(targetShape.back() != 1 &&
+         "1-D vectors with the trailing dim 1 are not yet supported");
+
+  // Blocks outside _this_ linalg.generic are effectively loop invariant.
+  // However, analysing block arguments for _this_ linalg.generic Op is a bit
+  // tricky. Just bail out in the latter case.
+  // TODO: We could try analysing the corresponding affine map here.
+  auto *block = linalgOp.getBlock();
+  if (val.dyn_cast<BlockArgument>())
+    return llvm::all_of(block->getArguments(),
+                        [&val](Value v) { return (v != val); });
 
   Operation *defOp = val.getDefiningOp();
   assert(defOp && "This is neither a block argument nor an operation result");
 
-  if (auto indexOp = dyn_cast<linalg::IndexOp>(defOp))
-    return (indexOp.getDim() == dim);
+  // Given the assumption on the loop ranges above, only the trailing loop
+  // index is not constant.
+  auto trailingLoopDim = linalgOp.getStaticLoopRanges().size() - 1;
+  if (auto indexOp = dyn_cast<linalg::IndexOp>(defOp)) {
+    foundIndexOp = (indexOp.getDim() == trailingLoopDim);
+    return true;
+  }
 
   auto *ancestor = block->findAncestorOpInBlock(*defOp);
 
   if (!ancestor)
     return false;
 
+  // Conservatively reject Ops that could lead to indices with stride other
+  // than 1.
+  if (!isa(
+          ancestor))
+    return false;
+
   bool result = false;
   for (auto op : ancestor->getOperands())
-    result |= isBasedOnIndexOp(linalgOp, op, dim);
+    result |= isContiguousLoadIdx(linalgOp, op, foundIndexOp);
 
   return result;
 }
 
@@ -725,7 +756,7 @@
   auto targetShape = linalgOp.getStaticLoopRanges();
 
-  // Assume that it's a gather load when reading _into_:
+  // 1. Assume that it's a gather load when reading _into_:
   // * an n-D vector, like`tensor<1x2x4xi32` or`tensor<2x1x4xi32>`, or
   // * a 1-D vector with the trailing dim equal 1, e.g. `tensor<1x4x1xi32`.
   // TODO: Relax these conditions.
@@ -736,44 +767,36 @@
   auto inputShape = extractOp.getTensor().getType().cast<ShapedType>();
 
-  // Assume that it's a gather load when reading _from_ a tensor for which the
-  // trailing dimension is 1, e.g. `tensor<1x4x1xi32>`.
+  // 2. Assume that it's a gather load when reading _from_ a tensor for which
+  // the trailing dimension is 1, e.g. `tensor<1x4x1xi32>`.
   // TODO: Relax this condition.
   if (inputShape.getShape().back() == 1)
     return VectorMemoryAccessKind::Gather;
 
-  // The trailing loop dim is needed when analyzing ops like:
-  // * %idx = `linalg.index : index`.
-  auto trailingLoopDim = targetShape.size() - 1;
-
   bool isContiguous = true;
 
-  // Iterate over all indices. Analyze the way each index is calculated and
-  // decide whether it is suitable for a contiguous load (e.g. loop invariant).
+  // 3a. Analyze the leading indices of `extractOp`.
+  // Look at the way each index is calculated and decide whether it is suitable
+  // for a contiguous load, i.e. whether it's loop invariant.
   auto indices = extractOp.getIndices();
-  for (auto [i, indexVal] : llvm::enumerate(indices)) {
-    if (inputShape.getShape()[i] == 1) {
-      // This index will always be equal 0, so it is a loop-invariant constant.
-      continue;
-    }
+  auto leadIndices = ValueRange(indices.drop_back(1));
 
-    // Should this index be loop invariant?
-    // * _no_ if this is the trailing index,
-    // * _yes_ otherwise.
-    auto extractOpBottomIdx = indices.size() - 1;
-    bool loopInvariantIndex = (i != extractOpBottomIdx);
+  for (auto [i, indexVal] : llvm::enumerate(leadIndices)) {
+    if (inputShape.getShape()[i] == 1)
+      continue;
 
-    isContiguous &= isContiguousLoadIdx(linalgOp, indexVal, trailingLoopDim,
-                                        loopInvariantIndex);
+    isContiguous &= isLoopInvariantIdx(linalgOp, indexVal);
   }
 
-  // The trailing index in the extract Op must increment with every iteration,
-  // which means that it must be based on a loop index. Given the assumption
-  // on the output tensor, only the trailing loop index is not constant, so
-  // that's what we need to check against.
+  // 3b. Analyze the trailing index for `extractOp`.
   auto extractOpTrailingIdx = indices.back();
+  // For contiguous loads, the trailing `extractOp` index should increment with
+  // every loop iteration. This effectively means that it must be based on the
+  // trailing loop index. This is what the following bool captures.
+  bool foundIndexOp = false;
   isContiguous &=
-      isBasedOnIndexOp(linalgOp, extractOpTrailingIdx, trailingLoopDim);
+      isContiguousLoadIdx(linalgOp, extractOpTrailingIdx, foundIndexOp);
+  isContiguous &= foundIndexOp;
 
   if (isContiguous) {
     LDBG("Found contigous load: " << extractOp);
diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -1783,6 +1783,211 @@
 // CHECK: %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_8]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
 // CHECK: return %[[VAL_14]] : tensor<5xf32>
 
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// The vectorizer converts `affine.apply` so that the subsequent Ops can be
+// vectorised based on the converted ops. Contiguous load.
+func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c79 = arith.constant 79 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
+    %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: index,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32
+// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_7:.*]] = arith.constant 79 : index
+// CHECK: %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
+// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
+// CHECK: %[[VAL_10:.*]] = vector.extractelement %[[VAL_9]]{{\[}}%[[VAL_4]] : i32] : vector<4xindex>
+// CHECK: %[[VAL_11:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_10]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
+// CHECK: %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_12]] : tensor<1x4xf32>
+// CHECK: }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// The vectorizer converts `affine.apply` so that the subsequent Ops can be
+// vectorised based on the converted ops. Gather load.
+func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c16 = arith.constant 16 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
+    %extracted = tensor.extract %6[%3, %c16] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: index,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK: %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
+// CHECK: %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
+// CHECK: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_7:.*]] = arith.constant dense<16> : vector<1x4xindex>
+// CHECK: %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
+// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
+// CHECK: %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : vector<4xindex> to vector<1x4xindex>
+// CHECK: %[[VAL_11:.*]] = arith.muli %[[VAL_10]], %[[VAL_7]] : vector<1x4xindex>
+// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_7]] : vector<1x4xindex>
+// CHECK: %[[VAL_13:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_12]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
+// CHECK: %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_14]] : tensor<1x4xf32>
+// CHECK: }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are
+// allowed when calculating indices for load operations. Gather load.
+func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c79 = arith.constant 79 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = arith.maxsi %2, %c79 : index
+    %extracted = tensor.extract %arg0[%3, %2] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_maxsi_gather(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK: %[[VAL_2:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK: %[[VAL_3:.*]] = arith.constant dense<1264> : vector<1x4xindex>
+// CHECK: %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
+// CHECK: %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
+// CHECK: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_7:.*]] = vector.broadcast %[[VAL_2]] : vector<4xindex> to vector<1x4xindex>
+// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : vector<1x4xindex>
+// CHECK: %[[VAL_9:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_8]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
+// CHECK: %[[VAL_10:.*]] = vector.transfer_write %[[VAL_9]], %[[VAL_1]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_10]] : tensor<1x4xf32>
+// CHECK: }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are
+// allowed when calculating indices for load operations. Contiguous load.
+func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c16 = arith.constant 16 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 0 : index
+    %3 = linalg.index 1 : index
+    %4 = arith.maxsi %2, %c16 : index
+    %extracted = tensor.extract %arg0[%4, %3] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK: %[[VAL_2:.*]] = arith.constant dense<16> : vector<1x4xindex>
+// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_6:.*]] = vector.shape_cast %[[VAL_2]] : vector<1x4xindex> to vector<4xindex>
+// CHECK: %[[VAL_7:.*]] = vector.extractelement %[[VAL_6]]{{\[}}%[[VAL_3]] : i32] : vector<4xindex>
+// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
+// CHECK: %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_1]]{{\[}}%[[VAL_4]], %[[VAL_4]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_9]] : tensor<1x4xf32>
+// CHECK: }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// The vectorizer assumes it's a gather load whenever using a block argument to
+// calculate an index.
+#map = affine_map<(d0) -> (d0)>
+func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1: tensor<5xindex>) -> tensor<5xf32> {
+  %0 = tensor.empty() : tensor<5xf32>
+  %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg1: tensor<5xindex>) outs(%0 : tensor<5xf32>) {
+  ^bb0(%in: index, %out: f32):
+    %2 = linalg.index 0 : index
+    %extracted_0 = tensor.extract %arg0[%in, %2] : tensor<5x6xf32>
+    linalg.yield %extracted_0 : f32
+  } -> tensor<5xf32>
+  return %1 : tensor<5xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_block_arg(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<5x6xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<5xindex>) -> tensor<5xf32> {
+// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3, 4]> : vector<5xindex>
+// CHECK: %[[VAL_4:.*]] = arith.constant dense<true> : vector<5xi1>
+// CHECK: %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
+// CHECK: %[[VAL_6:.*]] = arith.constant dense<6> : vector<5xindex>
+// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<5xf32>
+// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_2]]], %[[VAL_2]] {in_bounds = [true]} : tensor<5xindex>, vector<5xindex>
+// CHECK: %[[VAL_9:.*]] = arith.muli %[[VAL_8]], %[[VAL_6]] : vector<5xindex>
+// CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_3]] : vector<5xindex>
+// CHECK: %[[VAL_11:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_2]], %[[VAL_2]]] {{\[}}%[[VAL_10]]], %[[VAL_4]], %[[VAL_5]] : tensor<5x6xf32>, vector<5xindex>, vector<5xi1>, vector<5xf32> into vector<5xf32>
+// CHECK: %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_7]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
+// CHECK: return %[[VAL_12]] : tensor<5xf32>
+// CHECK: }
+
 transform.sequence failures(propagate) {
  ^bb1(%arg1: !pdl.operation):
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation