diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -406,6 +406,89 @@
   let hasFolder = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// GatherOp
+//===----------------------------------------------------------------------===//
+
+def Tensor_GatherOp : Tensor_Op<"gather", [
+    NoSideEffect
+  ]> {
+  let summary = "gather a subset of a tensor at specified indices.";
+  let description = [{
+    The `gather` operation extracts a subset of the elements from an input
+    tensor at the given indices.
+
+    In its most general form, the tensor of indices specifies all the
+    coordinates of every element to extract (i.e. COO format). The indices
+    are expected to be confined to coordinate values that fit the range of
+    the input tensor, otherwise the behavior is undefined.
+
+    Example:
+
+    ```
+    // For each 1x2 triple of coordinates in %indices, extract the
+    // element (i.e. 0-D subset) at the coordinates triple in %input.
+    // This corresponds to implicit COO coordinates = [0, 1, 2].
+    //
+    %out = gather %input[%indices] :
+      tensor<4x4x4xf32>[tensor<1x2x3xindex>] -> tensor<1x2xf32>
+    ```
+
+    A slice variant is provided that allows specifying whole slices of the
+    input tensor.
+
+    Example:
+
+    ```
+    // For each 5x6 singleton of coordinates in %indices, extract the 2-D
+    // slice [:, 1, :] at the coordinates singleton in %input.
+    //
+    %out = gather %input[%indices] coordinates = [1] :
+      tensor<4x4x4xf32>[tensor<5x6x1xindex>] -> tensor<5x6x4x4xf32>
+    ```
+
+    An optional `unique` unit attribute may be specified to indicate that the
+    coordinates are statically guaranteed to be unique at runtime.
+    Incorrectly setting the `unique` attribute when the coordinates are not
+    truly unique is undefined behavior.
+
+    Only full slices are meant to be supported by this op; if one desires
+    partial slices, one should compose this op with other tensor ops. This is
+    to avoid a slippery slope of complexity that would make the op unusable
+    in practice.
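+
+    For example, a partial window of each gathered slice can be recovered by
+    composing with `tensor.extract_slice` (illustrative sketch only; the
+    shapes below are arbitrary):
+
+    ```
+    // Gather the full [:, 1, :] slices, then take a 2x2 window of each
+    // gathered slice with tensor.extract_slice.
+    %full = gather %input[%indices] coordinates = [1] :
+      tensor<4x4x4xf32>[tensor<6x1xindex>] -> tensor<6x4x4xf32>
+    %partial = tensor.extract_slice %full[0, 0, 0] [6, 2, 2] [1, 1, 1] :
+      tensor<6x4x4xf32> to tensor<6x2x2xf32>
+    ```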
+
+    At the tensor level, the index tensor is specified in an AoS form (i.e.
+    the coordinate tuple is the most minor dimension). It is the
+    responsibility of further lowerings and bufferization to implement
+    various concrete layouts.
+
+    Note: As currently specified, the operation must lower to an abstraction
+    that performs copies to the output tensor. This is because the buffer
+    type system is currently not rich enough to allow multiple non-contiguous
+    views in the same type. This is visible more clearly in a notional buffer
+    version of the op:
+
+    ```
+    // memref<?x4xf32> is a contiguous buffer of ?x4 elements: gathering from
+    // random slices of %input must copy into this contiguous output.
+    %out = gather %input[%indices] coordinates = [1] :
+      memref<4x4xf32>[memref<?x1xindex>] -> memref<?x4xf32>
+
+    // Nested buffer support would allow gather to view into the input data.
+    %out = gather %input[%indices] coordinates = [1] :
+      memref<4x4xf32>[memref<?x1xindex>] -> memref<?xmemref<4xf32>>
+    ```
+  }];
+
+  let arguments = (ins AnyRankedTensor:$input,
+                       AnyRankedTensor:$indices,
+                       OptionalAttr<I64ArrayAttr>:$coordinates,
+                       UnitAttr:$unique);
+  let results = (outs AnyRankedTensor:$result);
+
+  let assemblyFormat = [{
+    $input `[` $indices `]`
+      (`coordinates` `=` $coordinates^)?
+      (`unique` $unique^)?
+      attr-dict
+    `:` type($input) `[` type($indices) `]` `->` type($result)
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // GenerateOp
 //===----------------------------------------------------------------------===//
@@ -1211,6 +1294,189 @@
   let hasVerifier = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// ParallelScatterOp
+//===----------------------------------------------------------------------===//
+
+def Tensor_ParallelScatterOp : Tensor_Op<"parallel_scatter", []> {
+  let summary = [{
+    Specify the tensor scatter update of a single thread of a parent
+    ParallelCombiningOpInterface op, at the given indices.
+  }];
+  let description = [{
+    The `parallel_scatter` operation yields a subset tensor value to its
+    parent ParallelCombiningOpInterface op. These subset tensor values are
+    aggregated, in some unspecified order, into the full tensor value
+    returned by the parent parallel iterating op. The `parallel_scatter` op
+    is one such op allowed inside a ParallelCombiningOpInterface op.
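+
+    Example (illustrative only: `parallel_iterate` and its `combining`
+    terminator region stand for a notional parent op implementing
+    ParallelCombiningOpInterface, in the style of `scf.foreach_thread`; they
+    are not ops defined here):
+
+    ```
+    // Each thread writes its own slices of %dest; the parent op aggregates
+    // the per-thread updates into the single returned tensor.
+    %res = parallel_iterate (%thread_id) shared(%dest) -> tensor<4x4x4xf32> {
+      ...
+      combining {
+        parallel_scatter %thread_slices into %dest[%thread_indices]
+          coordinates = [1] :
+          tensor<5x6x4x4xf32> into tensor<4x4x4xf32>[tensor<5x6x1xindex>]
+          -> tensor<4x4x4xf32>
+      }
+    }
+    ```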
+
+    Conflicting scatter indices result in undefined semantics: an index
+    written to by multiple parallel updates might contain data from any of
+    the updates, or even a malformed bit pattern.
+
+    In the future, a combinator region or symbol may be added to allow atomic
+    RMW semantics.
+
+    If an index is updated exactly once, the value contained at that index in
+    the resulting tensor will be equal to the value at the corresponding
+    index of the slice that was used for the update. If an index is not
+    updated at all, its value will be equal to the one in the original
+    tensor.
+
+    This op does not create a new value, which allows maintaining a clean
+    separation between the subset and full tensor.
+
+    Note that we cannot mark this operation as pure (NoSideEffect), even
+    though it has no side effects, because it would otherwise get DCEd during
+    canonicalization.
+
+    In its most general form, the tensor of indices specifies all the
+    coordinates of every element to insert (i.e. COO format). The indices
+    are expected to be confined to coordinate values that fit the range of
+    the dest tensor, otherwise the behavior is undefined.
+
+    Example:
+
+    ```
+    // For each 1x2 triple of coordinates in %indices, insert the
+    // element (i.e. 0-D subset) at the coordinates triple in %dest.
+    // This corresponds to implicit COO coordinates = [0, 1, 2].
+    //
+    parallel_scatter %input into %dest[%indices] :
+      tensor<1x2xf32> into tensor<4x4x4xf32>[tensor<1x2x3xindex>]
+      -> tensor<4x4x4xf32>
+    ```
+
+    A slice variant is provided that allows specifying whole tensor slices.
+
+    Example:
+
+    ```
+    // For each 5x6 singleton of coordinates in %indices, insert the 2-D
+    // slice [:, 1, :] at the coordinates singleton into %dest.
+    //
+    parallel_scatter %input into %dest[%indices] coordinates = [1] :
+      tensor<5x6x4x4xf32> into tensor<4x4x4xf32>[tensor<5x6x1xindex>]
+      -> tensor<4x4x4xf32>
+    ```
+
+    Only full slices are meant to be supported by this op; if one desires
+    partial slices, one should compose this op with other tensor ops. This is
+    to avoid a slippery slope of complexity that would make the op unusable
+    in practice.
+
+    At the tensor level, the index tensor is specified in an AoS form (i.e.
+    the coordinate tuple is the most minor dimension). It is the
+    responsibility of further lowerings and bufferization to implement
+    various concrete layouts.
+
+    After buffer allocation, the `parallel_scatter` op is expected to lower
+    into a yet-undefined buffer view op that supports nested buffer types
+    (see the discussion about copy vs. view in the documentation of GatherOp
+    and ScatterOp: the copy behavior is deemed prohibitive for the
+    `parallel_scatter` op).
+  }];
+
+  let arguments = (ins AnyRankedTensor:$input,
+                       AnyRankedTensor:$dest,
+                       AnyRankedTensor:$indices,
+                       OptionalAttr<I64ArrayAttr>:$coordinates,
+                       UnitAttr:$unique);
+  let results = (outs AnyRankedTensor:$result);
+
+  let assemblyFormat = [{
+    $input `into` $dest `[` $indices `]`
+      (`coordinates` `=` $coordinates^)?
+      (`unique` $unique^)?
+      attr-dict
+    `:` type($input) `into` type($dest) `[` type($indices) `]` `->` type($result)
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// ScatterOp
+//===----------------------------------------------------------------------===//
+
+def Tensor_ScatterOp : Tensor_Op<"scatter", [
+    NoSideEffect
+  ]> {
+  let summary = "scatter a subset of a tensor at specified indices.";
+  let description = [{
+    The `scatter` operation inserts a subset of the elements from an input
+    tensor into a destination tensor at the given indices.
+
+    In its most general form, the tensor of indices specifies all the
+    coordinates of every element to insert (i.e. COO format). The indices
+    are expected to be confined to coordinate values that fit the range of
+    the dest tensor, otherwise the behavior is undefined.
+
+    Example:
+
+    ```
+    // For each 1x2 triple of coordinates in %indices, insert the
+    // element (i.e. 0-D subset) at the coordinates triple in %dest.
+    // This corresponds to implicit COO coordinates = [0, 1, 2].
+    //
+    %out = scatter %input into %dest[%indices] :
+      tensor<1x2xf32> into tensor<4x4x4xf32>[tensor<1x2x3xindex>]
+      -> tensor<4x4x4xf32>
+    ```
+
+    A slice variant is provided that allows specifying whole tensor slices.
+
+    Example:
+
+    ```
+    // For each 5x6 singleton of coordinates in %indices, insert the 2-D
+    // slice [:, 1, :] at the coordinates singleton into %dest.
+    //
+    %out = scatter %input into %dest[%indices] coordinates = [1] :
+      tensor<5x6x4x4xf32> into tensor<4x4x4xf32>[tensor<5x6x1xindex>]
+      -> tensor<4x4x4xf32>
+    ```
+
+    An optional `unique` unit attribute may be specified to indicate that the
+    coordinates are statically guaranteed to be unique at runtime.
+    Incorrectly setting the `unique` attribute when the coordinates are not
+    truly unique is undefined behavior.
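+
+    Example (illustrative, showing the optional `unique` keyword of the
+    custom assembly sketched below; the coordinates in %indices are assumed
+    to be pairwise distinct at runtime):
+
+    ```
+    // The `unique` attribute asserts that no two coordinate singletons in
+    // %indices are equal, so no two slices are written to the same location.
+    %out = scatter %input into %dest[%indices] coordinates = [1] unique :
+      tensor<5x6x4x4xf32> into tensor<4x4x4xf32>[tensor<5x6x1xindex>]
+      -> tensor<4x4x4xf32>
+    ```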
+
+    Only full slices are meant to be supported by this op; if one desires
+    partial slices, one should compose this op with other tensor ops. This is
+    to avoid a slippery slope of complexity that would make the op unusable
+    in practice.
+
+    At the tensor level, the index tensor is specified in an AoS form (i.e.
+    the coordinate tuple is the most minor dimension). It is the
+    responsibility of further lowerings and bufferization to implement
+    various concrete layouts.
+
+    Note: As currently specified, the operation must lower to an abstraction
+    that performs copies to the output tensor. This is because the buffer
+    type system is currently not rich enough to allow multiple non-contiguous
+    views in the same type. This is visible more clearly in a notional buffer
+    version of the op:
+
+    ```
+    // memref<?x4xf32> is a contiguous buffer of ?x4 elements: scattering
+    // into random dest slices requires copying from this contiguous buffer
+    // into %dest.
+    some_side_effecting_op_writing_into %input, ...: memref<?x4xf32>
+    scatter %input into %dest[%indices] coordinates = [1] :
+      memref<?x4xf32> into memref<4x4xf32>[memref<?x1xindex>]
+
+    // Nested buffer support in the producing op would allow writing directly
+    // into the dest buffer.
+    %v = some_nested_buffer_view_op %dest[%indices] coordinates = [1] :
+      memref<?xmemref<4xf32>>
+    some_side_effecting_op_writing_into %v, ...: memref<?xmemref<4xf32>>
+    ```
+  }];
+
+  let arguments = (ins AnyRankedTensor:$input,
+                       AnyRankedTensor:$dest,
+                       AnyRankedTensor:$indices,
+                       OptionalAttr<I64ArrayAttr>:$coordinates,
+                       UnitAttr:$unique);
+  let results = (outs AnyRankedTensor:$result);
+
+  let assemblyFormat = [{
+    $input `into` $dest `[` $indices `]`
+      (`coordinates` `=` $coordinates^)?
+      (`unique` $unique^)?
+      attr-dict
+    `:` type($input) `into` type($dest) `[` type($indices) `]` `->` type($result)
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // SplatOp
 //===----------------------------------------------------------------------===//