diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -188,55 +188,50 @@ // Create and insert the operation. auto parallelOp = firOpBuilder.create( currentLocation, argTy, ifClauseOperand, numThreadsClauseOperand, - defaultClauseOperand.dyn_cast_or_null(), + defaultClauseOperand.dyn_cast_or_null(), privateClauseOperands, firstprivateClauseOperands, sharedClauseOperands, copyinClauseOperands, ValueRange(), ValueRange(), - procBindClauseOperand.dyn_cast_or_null()); + procBindClauseOperand.dyn_cast_or_null()); // Handle attribute based clauses. for (const auto &clause : parallelOpClauseList.v) { if (const auto &defaultClause = std::get_if(&clause.u)) { const auto &ompDefaultClause{defaultClause->v}; + omp::ClauseDefault clause; switch (ompDefaultClause.v) { case Fortran::parser::OmpDefaultClause::Type::Private: - parallelOp.default_valAttr(firOpBuilder.getStringAttr( - omp::stringifyClauseDefault(omp::ClauseDefault::defprivate))); + clause = omp::ClauseDefault::defprivate; break; case Fortran::parser::OmpDefaultClause::Type::Firstprivate: - parallelOp.default_valAttr( - firOpBuilder.getStringAttr(omp::stringifyClauseDefault( - omp::ClauseDefault::deffirstprivate))); + clause = omp::ClauseDefault::deffirstprivate; break; case Fortran::parser::OmpDefaultClause::Type::Shared: - parallelOp.default_valAttr(firOpBuilder.getStringAttr( - omp::stringifyClauseDefault(omp::ClauseDefault::defshared))); + clause = omp::ClauseDefault::defshared; break; case Fortran::parser::OmpDefaultClause::Type::None: - parallelOp.default_valAttr(firOpBuilder.getStringAttr( - omp::stringifyClauseDefault(omp::ClauseDefault::defnone))); + clause = omp::ClauseDefault::defnone; break; } + parallelOp.default_valAttr( + omp::ClauseDefaultAttr::get(firOpBuilder.getContext(), clause)); } if (const auto &procBindClause = std::get_if(&clause.u)) { const auto &ompProcBindClause{procBindClause->v}; + omp::ClauseProcBindKind pbKind; switch (ompProcBindClause.v) { case Fortran::parser::OmpProcBindClause::Type::Master: - parallelOp.proc_bind_valAttr( - firOpBuilder.getStringAttr(omp::stringifyClauseProcBindKind( - omp::ClauseProcBindKind::master))); + pbKind = omp::ClauseProcBindKind::master; break; case Fortran::parser::OmpProcBindClause::Type::Close: - parallelOp.proc_bind_valAttr( - firOpBuilder.getStringAttr(omp::stringifyClauseProcBindKind( - omp::ClauseProcBindKind::close))); + pbKind = omp::ClauseProcBindKind::close; break; case Fortran::parser::OmpProcBindClause::Type::Spread: - parallelOp.proc_bind_valAttr( - firOpBuilder.getStringAttr(omp::stringifyClauseProcBindKind( - omp::ClauseProcBindKind::spread))); + pbKind = omp::ClauseProcBindKind::spread; break; } + parallelOp.proc_bind_valAttr(omp::ClauseProcBindKindAttr::get( + firOpBuilder.getContext(), pbKind)); } } createBodyOfOp(parallelOp, firOpBuilder, currentLocation); diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt --- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt @@ -27,4 +27,9 @@ mlir_tablegen(GPUOpsEnums.cpp.inc -gen-enum-defs) add_public_tablegen_target(MLIRGPUOpsEnumsGen) -add_mlir_doc(Passes GPUPasses ./ -gen-pass-doc) +set(LLVM_TARGET_DEFINITIONS GPUOps.td) +mlir_tablegen(GPUOpsAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=gpu) +mlir_tablegen(GPUOpsAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=gpu) +add_public_tablegen_target(MLIRGPUOpsAttributesIncGen) + 
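For reference, the flang lowering hunk above now constructs the tablegen-generated enum attributes directly instead of stringifying the enum into a `StringAttr`. A minimal sketch of that pattern, using only the `ClauseProcBindKindAttr::get` API the patch itself relies on (the wrapper function is hypothetical, not part of the change):

```cpp
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"

// Attach a proc_bind clause value to an omp.parallel op. The specialized
// attribute wraps the C++ enum directly, so no string round-trip is needed
// and consumers can switch over the typed value.
static void setProcBind(mlir::omp::ParallelOp op,
                        mlir::omp::ClauseProcBindKind kind) {
  op.proc_bind_valAttr(
      mlir::omp::ClauseProcBindKindAttr::get(op.getContext(), kind));
}
```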
+add_mlir_doc(Passes GPUPasses ./ -gen-pass-doc) diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td --- a/mlir/include/mlir/Dialect/GPU/GPUBase.td +++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td @@ -53,6 +53,7 @@ }]; let dependentDialects = ["arith::ArithmeticDialect"]; + let useDefaultAttributePrinterParser = 1; } def GPU_AsyncToken : DialectType< diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h --- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -173,6 +173,9 @@ #include "mlir/Dialect/GPU/GPUOpInterfaces.h.inc" +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/GPU/GPUOpsAttributes.h.inc" + #define GET_OP_CLASSES #include "mlir/Dialect/GPU/GPUOps.h.inc" diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -16,6 +16,7 @@ include "mlir/Dialect/DLTI/DLTIBase.td" include "mlir/Dialect/GPU/GPUBase.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td" +include "mlir/IR/EnumAttr.td" include "mlir/IR/SymbolInterfaces.td" include "mlir/Interfaces/DataLayoutInterfaces.td" include "mlir/Interfaces/InferTypeOpInterface.td" @@ -28,10 +29,22 @@ class GPU_Op traits = []> : Op; +def GPU_Dimension : I32EnumAttr<"Dimension", + "a dimension, either 'x', 'y', or 'z'", + [ + I32EnumAttrCase<"x", 0>, + I32EnumAttrCase<"y", 1>, + I32EnumAttrCase<"z", 2> + ]>{ + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::gpu"; +} +def GPU_DimensionAttr : EnumAttr; + class GPU_IndexOp traits = []> : GPU_Op, - Arguments<(ins StrAttr:$dimension)>, Results<(outs Index)> { - let verifier = [{ return ::verifyIndexOp(*this); }]; + Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> { + let assemblyFormat = "$dimension attr-dict"; } def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> { @@ -42,7 +55,7 @@ Example: ```mlir - %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) + %bDimX = gpu.block_dim x ``` }]; } @@ -54,7 +67,7 @@ Example: ```mlir - %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) + %bIdY = gpu.block_id y ``` }]; } @@ -66,7 +79,7 @@ Example: ```mlir - %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) + %gDimZ = gpu.grid_dim z ``` }]; } @@ -78,7 +91,7 @@ Example: ```mlir - %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) + %tIdX = gpu.thread_id x ``` }]; } @@ -339,21 +352,21 @@ // Operations that produce block/thread IDs and dimensions are // injected when outlining the `gpu.launch` body to a function called // by `gpu.launch_func`.
- %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) - %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index) - %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index) + %tIdX = gpu.thread_id x + %tIdY = gpu.thread_id y + %tIdZ = gpu.thread_id z - %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) - %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index) - %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index) + %bDimX = gpu.block_dim x + %bDimY = gpu.block_dim y + %bDimZ = gpu.block_dim z - %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index) - %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) - %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index) + %bIdX = gpu.block_id x + %bIdY = gpu.block_id y + %bIdZ = gpu.block_id z - %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index) - %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index) - %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) + %gDimX = gpu.grid_dim x + %gDimY = gpu.grid_dim y + %gDimZ = gpu.grid_dim z "some_op"(%bx, %tx) : (index, index) -> () %42 = load %arg1[%bx] : memref @@ -608,15 +621,15 @@ } // add, mul mirror the XLA ComparisonDirection enum. -def GPU_AllReduceOpAdd : StrEnumAttrCase<"ADD", -1, "add">; -def GPU_AllReduceOpAnd : StrEnumAttrCase<"AND", -1, "and">; -def GPU_AllReduceOpMax : StrEnumAttrCase<"MAX", -1, "max">; -def GPU_AllReduceOpMin : StrEnumAttrCase<"MIN", -1, "min">; -def GPU_AllReduceOpMul : StrEnumAttrCase<"MUL", -1, "mul">; -def GPU_AllReduceOpOr : StrEnumAttrCase<"OR", -1, "or">; -def GPU_AllReduceOpXor : StrEnumAttrCase<"XOR", -1, "xor">; - -def GPU_AllReduceOperationAttr : StrEnumAttr<"AllReduceOperationAttr", +def GPU_AllReduceOpAdd : I32EnumAttrCase<"ADD", 0, "add">; +def GPU_AllReduceOpAnd : I32EnumAttrCase<"AND", 1, "and">; +def GPU_AllReduceOpMax : I32EnumAttrCase<"MAX", 2, "max">; +def GPU_AllReduceOpMin : I32EnumAttrCase<"MIN", 3, "min">; +def GPU_AllReduceOpMul : I32EnumAttrCase<"MUL", 4, "mul">; +def GPU_AllReduceOpOr : I32EnumAttrCase<"OR", 5, "or">; +def GPU_AllReduceOpXor : I32EnumAttrCase<"XOR", 6, "xor">; + +def GPU_AllReduceOperation : I32EnumAttr<"AllReduceOperation", "built-in reduction operations supported by gpu.allreduce.", [ GPU_AllReduceOpAdd, @@ -627,8 +640,11 @@ GPU_AllReduceOpOr, GPU_AllReduceOpXor ]>{ + let genSpecializedAttr = 0; let cppNamespace = "::mlir::gpu"; } +def GPU_AllReduceOperationAttr : EnumAttr; def GPU_AllReduceOp : GPU_Op<"all_reduce", [SameOperandsAndResultType, IsolatedFromAbove]>, @@ -643,12 +659,12 @@ For example, both ```mlir - %1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32) - %2 = "gpu.all_reduce"(%0) ({ + %1 = gpu.all_reduce add %0 {} : (f32) -> (f32) + %2 = gpu.all_reduce %0 { ^bb(%lhs : f32, %rhs : f32): %sum = arith.addf %lhs, %rhs : f32 "gpu.yield"(%sum) : (f32) -> () - }) : (f32) -> (f32) + } : (f32) -> (f32) ``` compute the sum of each work item's %0 value. 
The first version specifies @@ -661,31 +677,33 @@ }]; let regions = (region AnyRegion:$body); let verifier = [{ return ::verifyAllReduce(*this); }]; + let assemblyFormat = "custom($op) $value $body attr-dict `:` functional-type(operands, results)"; } -def GPU_ShuffleOpXor : StrEnumAttrCase<"XOR", -1, "xor">; -def GPU_ShuffleOpDown : StrEnumAttrCase<"DOWN", -1, "down">; -def GPU_ShuffleOpUp : StrEnumAttrCase<"UP", -1, "up">; -def GPU_ShuffleOpIdx : StrEnumAttrCase<"IDX", -1, "idx">; +def GPU_ShuffleOpXor : I32EnumAttrCase<"XOR", 0, "xor">; +def GPU_ShuffleOpDown : I32EnumAttrCase<"DOWN", 1, "down">; +def GPU_ShuffleOpUp : I32EnumAttrCase<"UP", 2, "up">; +def GPU_ShuffleOpIdx : I32EnumAttrCase<"IDX", 3, "idx">; -def GPU_ShuffleModeAttr : StrEnumAttr<"ShuffleModeAttr", +def GPU_ShuffleMode : I32EnumAttr<"ShuffleMode", "Indexing modes supported by gpu.shuffle.", [ GPU_ShuffleOpXor, GPU_ShuffleOpUp, GPU_ShuffleOpDown, GPU_ShuffleOpIdx, - ]>{ + ]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::gpu"; - let storageType = "mlir::StringAttr"; - let returnType = "::mlir::gpu::ShuffleModeAttr"; - let convertFromStorage = - "*symbolizeEnum<::mlir::gpu::ShuffleModeAttr>($_self.getValue())"; - let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))"; } +def GPU_ShuffleModeAttr : EnumAttr; +def I32OrF32 : TypeConstraint, + "i32 or f32">; -def GPU_ShuffleOp : GPU_Op<"shuffle", [NoSideEffect]>, - Arguments<(ins AnyType:$value, I32:$offset, I32:$width, +def GPU_ShuffleOp : GPU_Op< + "shuffle", [NoSideEffect, AllTypesMatch<["value", "result"]>]>, + Arguments<(ins I32OrF32:$value, I32:$offset, I32:$width, GPU_ShuffleModeAttr:$mode)>, - Results<(outs AnyType:$result, I1:$valid)> { + Results<(outs I32OrF32:$result, I1:$valid)> { let summary = "Shuffles values within a subgroup."; let description = [{ The "shuffle" op moves values to a different invocation within the same @@ -706,9 +724,7 @@ in the shuffle. Exactly the first `width` invocations of a subgroup need to execute this op in convergence. 
}]; - let verifier = [{ return ::verifyShuffleOp(*this); }]; - let printer = [{ printShuffleOp(p, *this); }]; - let parser = [{ return parseShuffleOp(parser, result); }]; + let assemblyFormat = "$mode $value `,` $offset `,` $width attr-dict `:` type($value)"; } def GPU_BarrierOp : GPU_Op<"barrier"> { @@ -1151,22 +1167,25 @@ }]; } -def GPU_ELEMENTWISE_OP_ADD : StrEnumAttrCase<"ADDF">; -def GPU_ELEMENTWISE_OP_MUL : StrEnumAttrCase<"MULF">; -def GPU_ELEMENTWISE_OP_MAXF : StrEnumAttrCase<"MAXF">; -def GPU_ELEMENTWISE_OP_MINF : StrEnumAttrCase<"MINF">; -def GPU_ELEMENTWISE_OP_DIVF : StrEnumAttrCase<"DIVF">; - -def MMAElementWiseAttr : StrEnumAttr<"MMAElementwiseOp", - "elementwise operation to apply to mma matrix", - [GPU_ELEMENTWISE_OP_ADD, GPU_ELEMENTWISE_OP_MUL, - GPU_ELEMENTWISE_OP_MAXF, GPU_ELEMENTWISE_OP_MINF, GPU_ELEMENTWISE_OP_DIVF]> { +def GPU_ElementwiseOpAdd : I32EnumAttrCase<"ADDF", 0, "addf">; +def GPU_ElementwiseOpMul : I32EnumAttrCase<"MULF", 1, "mulf">; +def GPU_ElementwiseOpMaxF : I32EnumAttrCase<"MAXF", 2, "maxf">; +def GPU_ElementwiseOpMinF : I32EnumAttrCase<"MINF", 3, "minf">; +def GPU_ElementwiseOpDivF : I32EnumAttrCase<"DIVF", 4, "divf">; + +def MMAElementWise : I32EnumAttr<"MMAElementwiseOp", + "elementwise operation to apply to mma matrix", [ + GPU_ElementwiseOpAdd, + GPU_ElementwiseOpMul, + GPU_ElementwiseOpMaxF, + GPU_ElementwiseOpMinF, + GPU_ElementwiseOpDivF + ]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::gpu"; - let storageType = "::mlir::StringAttr"; - let returnType = "::mlir::gpu::MMAElementwiseOp"; - let convertFromStorage = "*symbolizeMMAElementwiseOp($_self.getValue())"; - let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))"; } +def MMAElementWiseAttr : EnumAttr; def GPU_SubgroupMmaElementwiseOp : GPU_Op<"subgroup_mma_elementwise", [NoSideEffect, @@ -1193,7 +1212,8 @@ ``` }]; - let arguments = (ins Variadic:$args, MMAElementWiseAttr:$operation); + let arguments = (ins Variadic:$args, + MMAElementWiseAttr:$operation); let results = (outs GPU_MMAMatrix:$res); @@ -1204,7 +1224,7 @@ }]; let assemblyFormat = [{ - $args attr-dict `:` functional-type($args, $res) + $operation $args attr-dict `:` functional-type($args, $res) }]; } diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt --- a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt @@ -35,6 +35,8 @@ mlir_tablegen(NVVMConversions.inc -gen-llvmir-conversions) mlir_tablegen(NVVMOpsEnums.h.inc -gen-enum-decls) mlir_tablegen(NVVMOpsEnums.cpp.inc -gen-enum-defs) +mlir_tablegen(NVVMOpsAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=nvvm) +mlir_tablegen(NVVMOpsAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=nvvm) add_public_tablegen_target(MLIRNVVMConversionsIncGen) add_mlir_dialect(ROCDLOps rocdl) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h @@ -30,6 +30,9 @@ mlir::MLIRContext *context); ///// Ops ///// +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/LLVMIR/NVVMOpsAttributes.h.inc" + #define GET_OP_CLASSES #include "mlir/Dialect/LLVMIR/NVVMOps.h.inc" diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -13,6 +13,7 @@ #ifndef NVVMIR_OPS #define 
NVVMIR_OPS +include "mlir/IR/EnumAttr.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" @@ -34,6 +35,8 @@ /// functions. static StringRef getKernelFuncAttrName() { return "nvvm.kernel"; } }]; + + let useDefaultAttributePrinterParser = 1; } //===----------------------------------------------------------------------===// @@ -100,20 +103,18 @@ let assemblyFormat = "attr-dict"; } -def ShflKindBfly : StrEnumAttrCase<"bfly">; -def ShflKindUp : StrEnumAttrCase<"up">; -def ShflKindDown : StrEnumAttrCase<"down">; -def ShflKindIdx : StrEnumAttrCase<"idx">; +def ShflKindBfly : I32EnumAttrCase<"bfly", 0>; +def ShflKindUp : I32EnumAttrCase<"up", 1>; +def ShflKindDown : I32EnumAttrCase<"down", 2>; +def ShflKindIdx : I32EnumAttrCase<"idx", 3>; /// Enum attribute of the different shuffle kinds. -def ShflKind : StrEnumAttr<"ShflKind", "NVVM shuffle kind", +def ShflKind : I32EnumAttr<"ShflKind", "NVVM shuffle kind", [ShflKindBfly, ShflKindUp, ShflKindDown, ShflKindIdx]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; - let storageType = "mlir::StringAttr"; - let returnType = "NVVM::ShflKind"; - let convertFromStorage = "*symbolizeEnum($_self.getValue())"; - let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))"; } +def ShflKindAttr : EnumAttr; def NVVM_ShflOp : NVVM_Op<"shfl.sync">, @@ -122,7 +123,7 @@ LLVM_Type:$val, I32:$offset, I32:$mask_and_clamp, - ShflKind:$kind, + ShflKindAttr:$kind, OptionalAttr:$return_value_and_is_valid)> { string llvmBuilder = [{ auto intId = getShflIntrinsicId( @@ -420,52 +421,52 @@ string id = !foldl("", f, acc, el, acc # "\n" # el); } -def MMALayoutRow : StrEnumAttrCase<"row">; -def MMALayoutCol : StrEnumAttrCase<"col">; +def MMALayoutRow : I32EnumAttrCase<"row", 0>; +def MMALayoutCol : I32EnumAttrCase<"col", 1>; /// Enum attribute of the different matrix layout. -def MMALayout : StrEnumAttr<"MMALayout", "NVVM MMA layout", +def MMALayout : I32EnumAttr<"MMALayout", "NVVM MMA layout", [MMALayoutRow, MMALayoutCol]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; - let storageType = "mlir::StringAttr"; - let returnType = "NVVM::MMALayout"; - let convertFromStorage = "*symbolizeEnum($_self.getValue())"; - let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))"; +} +def MMALayoutAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; } -def MMATypeF16 : StrEnumAttrCase<"f16">; -def MMATypeF32 : StrEnumAttrCase<"f32">; -def MMATypeTF32 : StrEnumAttrCase<"tf32">; +def MMATypeF16 : I32EnumAttrCase<"f16", 0>; +def MMATypeF32 : I32EnumAttrCase<"f32", 1>; +def MMATypeTF32 : I32EnumAttrCase<"tf32", 2>; /// Enum attribute of the different matrix types. -def MMATypes : StrEnumAttr<"MMATypes", "NVVM MMA types", +def MMATypes : I32EnumAttr<"MMATypes", "NVVM MMA types", [MMATypeF16, MMATypeF32, MMATypeTF32]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; - let storageType = "mlir::StringAttr"; - let returnType = "NVVM::MMATypes"; - let convertFromStorage = "*symbolizeEnum($_self.getValue())"; - let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))"; +} +def MMATypesAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; } -def MMAFragA : StrEnumAttrCase<"a">; -def MMAFragB : StrEnumAttrCase<"b">; -def MMAFragC : StrEnumAttrCase<"c">; +def MMAFragA : I32EnumAttrCase<"a", 0>; +def MMAFragB : I32EnumAttrCase<"b", 1>; +def MMAFragC : I32EnumAttrCase<"c", 2>; /// Enum attribute of the different frag types. 
-def MMAFragAttr : StrEnumAttr<"MMAFrag", "NVVM MMA frag type", +def MMAFrag: I32EnumAttr<"MMAFrag", "NVVM MMA frag type", [MMAFragA, MMAFragB, MMAFragC]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; - let storageType = "mlir::StringAttr"; - let returnType = "NVVM::MMAFrag"; - let convertFromStorage = "*symbolizeEnum($_self.getValue())"; - let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))"; +} +def MMAFragAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; } def NVVM_WMMALoadOp: NVVM_Op<"wmma.load">, Results<(outs LLVM_AnyStruct:$res)>, Arguments<(ins LLVM_AnyPointer: $ptr, I32: $stride, I32Attr:$m, - I32Attr:$n, I32Attr:$k, MMALayout:$layout, MMATypes:$eltype, - MMAFragAttr:$frag)> { + I32Attr:$n, I32Attr:$k, MMALayoutAttr:$layout, + MMATypesAttr:$eltype, MMAFragAttr:$frag)> { let summary = "Warp synchronous matrix load"; @@ -542,8 +543,8 @@ def NVVM_WMMAStoreOp : NVVM_Op<"wmma.store">, Arguments<(ins LLVM_AnyPointer: $ptr, - I32Attr:$m, I32Attr:$n, I32Attr:$k, MMALayout:$layout, - MMATypes:$eltype, Variadic:$args, I32: $stride)>{ + I32Attr:$m, I32Attr:$n, I32Attr:$k, MMALayoutAttr:$layout, + MMATypesAttr:$eltype, Variadic:$args, I32: $stride)>{ let summary = "Warp synchronous matrix store"; let extraClassDeclaration = @@ -598,9 +599,9 @@ // Base class for all the variants of WMMA mmaOps that may be defined. def NVVM_WMMAMmaOp : NVVM_Op<"wmma.mma">, Results<(outs LLVM_AnyStruct:$res)>, - Arguments<(ins I32Attr:$m, I32Attr:$n, I32Attr:$k, MMALayout:$layoutA, - MMALayout:$layoutB, MMATypes:$eltypeA, MMATypes:$eltypeB, - Variadic:$args)>{ + Arguments<(ins I32Attr:$m, I32Attr:$n, I32Attr:$k, MMALayoutAttr:$layoutA, + MMALayoutAttr:$layoutB, MMATypesAttr:$eltypeA, + MMATypesAttr:$eltypeB, Variadic:$args)>{ let summary = "Warp synchronous matrix-multiply accumulate using tensor cores."; let extraClassDeclaration = diff --git a/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt --- a/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Frontend/OpenACC/ACC.td) -mlir_tablegen(AccCommon.td --gen-directive-decl) +mlir_tablegen(AccCommon.td --gen-directive-decl --directives-dialect=OpenACC) add_public_tablegen_target(acc_common_td) set(LLVM_TARGET_DEFINITIONS OpenACCOps.td) @@ -9,6 +9,8 @@ mlir_tablegen(OpenACCOps.cpp.inc -gen-op-defs) mlir_tablegen(OpenACCOpsEnums.h.inc -gen-enum-decls) mlir_tablegen(OpenACCOpsEnums.cpp.inc -gen-enum-defs) +mlir_tablegen(OpenACCOpsAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=acc) +mlir_tablegen(OpenACCOpsAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=acc) add_mlir_doc(OpenACCOps OpenACCDialect Dialects/ -gen-dialect-doc) add_public_tablegen_target(MLIROpenACCOpsIncGen) add_dependencies(OpenACCDialectDocGen acc_common_td) diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h --- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h @@ -19,6 +19,9 @@ #include "mlir/Dialect/OpenACC/OpenACCOpsDialect.h.inc" #include "mlir/Dialect/OpenACC/OpenACCOpsEnums.h.inc" +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/OpenACC/OpenACCOpsAttributes.h.inc" + #define GET_OP_CLASSES #include "mlir/Dialect/OpenACC/OpenACCOps.h.inc" diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td --- 
a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -13,8 +13,8 @@ #ifndef OPENACC_OPS #define OPENACC_OPS +include "mlir/IR/EnumAttr.td" include "mlir/IR/OpBase.td" -include "mlir/Dialect/OpenACC/AccCommon.td" def OpenACC_Dialect : Dialect { let name = "acc"; @@ -25,9 +25,13 @@ This dialect models the construct from the OpenACC 3.1 directive language. }]; + let useDefaultAttributePrinterParser = 1; let cppNamespace = "::mlir::acc"; } +// AccCommon requires definition of OpenACC_Dialect. +include "mlir/Dialect/OpenACC/AccCommon.td" + // Base class for OpenACC dialect ops. class OpenACC_Op traits = []> : Op { @@ -38,19 +42,19 @@ } // Reduction operation enumeration. -def OpenACC_ReductionOpAdd : StrEnumAttrCase<"redop_add">; -def OpenACC_ReductionOpMul : StrEnumAttrCase<"redop_mul">; -def OpenACC_ReductionOpMax : StrEnumAttrCase<"redop_max">; -def OpenACC_ReductionOpMin : StrEnumAttrCase<"redop_min">; -def OpenACC_ReductionOpAnd : StrEnumAttrCase<"redop_and">; -def OpenACC_ReductionOpOr : StrEnumAttrCase<"redop_or">; -def OpenACC_ReductionOpXor : StrEnumAttrCase<"redop_xor">; -def OpenACC_ReductionOpLogEqv : StrEnumAttrCase<"redop_leqv">; -def OpenACC_ReductionOpLogNeqv : StrEnumAttrCase<"redop_lneqv">; -def OpenACC_ReductionOpLogAnd : StrEnumAttrCase<"redop_land">; -def OpenACC_ReductionOpLogOr : StrEnumAttrCase<"redop_lor">; - -def OpenACC_ReductionOpAttr : StrEnumAttr<"ReductionOpAttr", +def OpenACC_ReductionOpAdd : I32EnumAttrCase<"redop_add", 0>; +def OpenACC_ReductionOpMul : I32EnumAttrCase<"redop_mul", 1>; +def OpenACC_ReductionOpMax : I32EnumAttrCase<"redop_max", 2>; +def OpenACC_ReductionOpMin : I32EnumAttrCase<"redop_min", 3>; +def OpenACC_ReductionOpAnd : I32EnumAttrCase<"redop_and", 4>; +def OpenACC_ReductionOpOr : I32EnumAttrCase<"redop_or", 5>; +def OpenACC_ReductionOpXor : I32EnumAttrCase<"redop_xor", 6>; +def OpenACC_ReductionOpLogEqv : I32EnumAttrCase<"redop_leqv", 7>; +def OpenACC_ReductionOpLogNeqv : I32EnumAttrCase<"redop_lneqv", 8>; +def OpenACC_ReductionOpLogAnd : I32EnumAttrCase<"redop_land", 9>; +def OpenACC_ReductionOpLogOr : I32EnumAttrCase<"redop_lor", 10>; + +def OpenACC_ReductionOp : I32EnumAttr<"ReductionOp", "built-in reduction operations supported by OpenACC", [OpenACC_ReductionOpAdd, OpenACC_ReductionOpMul, OpenACC_ReductionOpMax, OpenACC_ReductionOpMin, OpenACC_ReductionOpAnd, OpenACC_ReductionOpOr, @@ -58,8 +62,11 @@ OpenACC_ReductionOpLogNeqv, OpenACC_ReductionOpLogAnd, OpenACC_ReductionOpLogOr ]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::acc"; } +def OpenACC_ReductionOpAttr : EnumAttr; // Type used in operation below. 
def IntOrIndex : AnyTypeOf<[AnyInteger, Index]>; @@ -110,7 +117,7 @@ Variadic:$attachOperands, Variadic:$gangPrivateOperands, Variadic:$gangFirstPrivateOperands, - OptionalAttr:$defaultAttr); + OptionalAttr:$defaultAttr); let regions = (region AnyRegion:$region); @@ -188,7 +195,7 @@ Variadic:$presentOperands, Variadic:$deviceptrOperands, Variadic:$attachOperands, - OptionalAttr:$defaultAttr); + OptionalAttr:$defaultAttr); let regions = (region AnyRegion:$region); diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt --- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Frontend/OpenMP/OMP.td) -mlir_tablegen(OmpCommon.td --gen-directive-decl) +mlir_tablegen(OmpCommon.td --gen-directive-decl --directives-dialect=OpenMP) add_public_tablegen_target(omp_common_td) set(LLVM_TARGET_DEFINITIONS OpenMPOps.td) @@ -11,6 +11,8 @@ mlir_tablegen(OpenMPOpsEnums.cpp.inc -gen-enum-defs) mlir_tablegen(OpenMPTypeInterfaces.h.inc -gen-type-interface-decls) mlir_tablegen(OpenMPTypeInterfaces.cpp.inc -gen-type-interface-defs) +mlir_tablegen(OpenMPOpsAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=omp) +mlir_tablegen(OpenMPOpsAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=omp) add_mlir_doc(OpenMPOps OpenMPDialect Dialects/ -gen-dialect-doc) add_public_tablegen_target(MLIROpenMPOpsIncGen) add_dependencies(OpenMPDialectDocGen omp_common_td) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h @@ -25,6 +25,9 @@ #include "mlir/Dialect/OpenMP/OpenMPOpsInterfaces.h.inc" #include "mlir/Dialect/OpenMP/OpenMPTypeInterfaces.h.inc" +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.h.inc" + #define GET_OP_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOps.h.inc" diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -14,11 +14,11 @@ #ifndef OPENMP_OPS #define OPENMP_OPS +include "mlir/IR/EnumAttr.td" include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/IR/SymbolInterfaces.td" -include "mlir/Dialect/OpenMP/OmpCommon.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td" @@ -26,8 +26,12 @@ let name = "omp"; let cppNamespace = "::mlir::omp"; let dependentDialects = ["::mlir::LLVM::LLVMDialect"]; + let useDefaultAttributePrinterParser = 1; } +// OmpCommon requires definition of OpenMP_Dialect.
+include "mlir/Dialect/OpenMP/OmpCommon.td" + class OpenMP_Op traits = []> : Op; @@ -60,18 +64,21 @@ //===----------------------------------------------------------------------===// // Possible values for the default clause -def ClauseDefaultPrivate : StrEnumAttrCase<"defprivate">; -def ClauseDefaultFirstPrivate : StrEnumAttrCase<"deffirstprivate">; -def ClauseDefaultShared : StrEnumAttrCase<"defshared">; -def ClauseDefaultNone : StrEnumAttrCase<"defnone">; +def ClauseDefaultPrivate : I32EnumAttrCase<"defprivate", 0>; +def ClauseDefaultFirstPrivate : I32EnumAttrCase<"deffirstprivate", 1>; +def ClauseDefaultShared : I32EnumAttrCase<"defshared", 2>; +def ClauseDefaultNone : I32EnumAttrCase<"defnone", 3>; -def ClauseDefault : StrEnumAttr< +def ClauseDefault : I32EnumAttr< "ClauseDefault", "default clause", [ClauseDefaultPrivate, ClauseDefaultFirstPrivate, ClauseDefaultShared, ClauseDefaultNone]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::omp"; } +def ClauseDefaultAttr : EnumAttr; def ParallelOp : OpenMP_Op<"parallel", [AttrSizedOperandSegments, DeclareOpInterfaceMethods]> { @@ -105,14 +112,14 @@ let arguments = (ins Optional:$if_expr_var, Optional:$num_threads_var, - OptionalAttr:$default_val, + OptionalAttr:$default_val, Variadic:$private_vars, Variadic:$firstprivate_vars, Variadic:$shared_vars, Variadic:$copyin_vars, Variadic:$allocate_vars, Variadic:$allocators_vars, - OptionalAttr:$proc_bind_val); + OptionalAttr:$proc_bind_val); let regions = (region AnyRegion:$region); @@ -136,19 +143,20 @@ let assemblyFormat = "attr-dict"; } -def OMP_SCHEDULE_MOD_None : StrEnumAttrCase<"none", 0>; -def OMP_SCHEDULE_MOD_Monotonic : StrEnumAttrCase<"monotonic", 1>; -def OMP_SCHEDULE_MOD_Nonmonotonic : StrEnumAttrCase<"nonmonotonic", 2>; -def OMP_SCHEDULE_MOD_SIMD : StrEnumAttrCase<"simd", 3>; - -def ScheduleModifier : StrEnumAttr<"ScheduleModifier", "OpenMP Schedule Modifier", - [OMP_SCHEDULE_MOD_None, - OMP_SCHEDULE_MOD_Monotonic, - OMP_SCHEDULE_MOD_Nonmonotonic, - OMP_SCHEDULE_MOD_SIMD]> -{ - let cppNamespace = "::mlir::omp"; +def OMP_ScheduleModNone : I32EnumAttrCase<"none", 0>; +def OMP_ScheduleModMonotonic : I32EnumAttrCase<"monotonic", 1>; +def OMP_ScheduleModNonmonotonic : I32EnumAttrCase<"nonmonotonic", 2>; +def OMP_ScheduleModSIMD : I32EnumAttrCase<"simd", 3>; + +def ScheduleModifier + : I32EnumAttr<"ScheduleModifier", "OpenMP Schedule Modifier", + [OMP_ScheduleModNone, OMP_ScheduleModMonotonic, + OMP_ScheduleModNonmonotonic, OMP_ScheduleModSIMD]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::omp"; } +def ScheduleModifierAttr : EnumAttr; //===----------------------------------------------------------------------===// // 2.8.1 Sections Construct @@ -288,14 +296,14 @@ Variadic:$linear_step_vars, Variadic:$reduction_vars, OptionalAttr:$reductions, - OptionalAttr:$schedule_val, + OptionalAttr:$schedule_val, Optional:$schedule_chunk_var, - OptionalAttr:$schedule_modifier, + OptionalAttr:$schedule_modifier, UnitAttr:$simd_modifier, Confined, [IntMinValue<0>]>:$collapse_val, UnitAttr:$nowait, Confined, [IntMinValue<0>]>:$ordered_val, - OptionalAttr:$order_val, + OptionalAttr:$order_val, UnitAttr:$inclusive); let skipDefaultBuilders = 1; @@ -487,15 +495,19 @@ // [5.1] 2.19.9 ordered Construct //===----------------------------------------------------------------------===// -def ClauseDependSource : StrEnumAttrCase<"dependsource">; -def ClauseDependSink : StrEnumAttrCase<"dependsink">; +def ClauseDependSource : I32EnumAttrCase<"dependsource", 0>; +def ClauseDependSink : 
I32EnumAttrCase<"dependsink", 1>; -def ClauseDepend : StrEnumAttr< +def ClauseDepend : I32EnumAttr< "ClauseDepend", "depend clause", [ClauseDependSource, ClauseDependSink]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::omp"; } +def ClauseDependAttr : EnumAttr { + let assemblyFormat = "`(` $value `)`"; +} def OrderedOp : OpenMP_Op<"ordered"> { let summary = "ordered construct without region"; @@ -516,12 +528,12 @@ clauses exist in one ORDERED directive. }]; - let arguments = (ins OptionalAttr:$depend_type_val, + let arguments = (ins OptionalAttr:$depend_type_val, Confined, [IntMinValue<0>]>:$num_loops_val, Variadic:$depend_vec_vars); let assemblyFormat = [{ - ( `depend_type` `(` $depend_type_val^ `)` )? + ( `depend_type` `` $depend_type_val^ )? ( `depend_vec` `(` $depend_vec_vars^ `:` type($depend_vec_vars) `)` )? attr-dict }]; @@ -596,7 +608,7 @@ let arguments = (ins OpenMP_PointerLikeType:$x, OpenMP_PointerLikeType:$v, DefaultValuedAttr:$hint, - OptionalAttr:$memory_order); + OptionalAttr:$memory_order); let parser = [{ return parseAtomicReadOp(parser, result); }]; let printer = [{ return printAtomicReadOp(p, *this); }]; let verifier = [{ return verifyAtomicReadOp(*this); }]; @@ -625,7 +637,7 @@ let arguments = (ins OpenMP_PointerLikeType:$address, AnyType:$value, DefaultValuedAttr:$hint, - OptionalAttr:$memory_order); + OptionalAttr:$memory_order); let parser = [{ return parseAtomicWriteOp(parser, result); }]; let printer = [{ return printAtomicWriteOp(p, *this); }]; let verifier = [{ return verifyAtomicWriteOp(*this); }]; @@ -690,7 +702,7 @@ UnitAttr:$isXBinopExpr, AtomicBinOpKindAttr:$binop, DefaultValuedAttr:$hint, - OptionalAttr:$memory_order); + OptionalAttr:$memory_order); let parser = [{ return parseAtomicUpdateOp(parser, result); }]; let printer = [{ return printAtomicUpdateOp(p, *this); }]; let verifier = [{ return verifyAtomicUpdateOp(*this); }]; diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.h b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.h --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.h +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.h @@ -23,9 +23,9 @@ namespace mlir { namespace spirv { enum class Capability : uint32_t; -enum class DeviceType; -enum class Extension; -enum class Vendor; +enum class DeviceType : uint32_t; +enum class Extension : uint32_t; +enum class Vendor : uint32_t; enum class Version : uint32_t; namespace detail { diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -15,6 +15,7 @@ #ifndef MLIR_DIALECT_SPIRV_IR_BASE #define MLIR_DIALECT_SPIRV_IR_BASE +include "mlir/IR/EnumAttr.td" include "mlir/IR/OpBase.td" include "mlir/Dialect/SPIRV/IR/SPIRVAvailability.td" @@ -97,17 +98,15 @@ let cppNamespace = "::mlir::spirv"; } -// Wrapper over base StrEnumAttr to set common fields. -class SPV_StrEnumAttr cases> : - StrEnumAttr { - let predicate = And<[ - StrAttr.predicate, - CPred<"::mlir::spirv::symbolize" # name # "(" - "$_self.cast().getValue()).hasValue()">, - ]>; +// Wrapper over base I32EnumAttr to set common fields. 
+class SPV_Enum cases> + : I32EnumAttr { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::spirv"; } +class SPV_EnumAttr cases> : + EnumAttr, mnemonic>; //===----------------------------------------------------------------------===// // SPIR-V availability definitions @@ -147,7 +146,7 @@ }]; } -class Extension extensions> : Availability { +class Extension extensions> : Availability { let cppNamespace = "::mlir::spirv"; let interfaceName = "QueryExtensionInterface"; let interfaceDescription = [{ @@ -263,31 +262,31 @@ // SPIR-V target GPU vendor and device definitions //===----------------------------------------------------------------------===// -def SPV_DT_CPU : StrEnumAttrCase<"CPU">; -def SPV_DT_DiscreteGPU : StrEnumAttrCase<"DiscreteGPU">; -def SPV_DT_IntegratedGPU : StrEnumAttrCase<"IntegratedGPU">; +def SPV_DT_CPU : I32EnumAttrCase<"CPU", 0>; +def SPV_DT_DiscreteGPU : I32EnumAttrCase<"DiscreteGPU", 1>; +def SPV_DT_IntegratedGPU : I32EnumAttrCase<"IntegratedGPU", 2>; // An accelerator other than GPU or CPU -def SPV_DT_Other : StrEnumAttrCase<"Other">; +def SPV_DT_Other : I32EnumAttrCase<"Other", 3>; // Information missing. -def SPV_DT_Unknown : StrEnumAttrCase<"Unknown">; +def SPV_DT_Unknown : I32EnumAttrCase<"Unknown", 4>; -def SPV_DeviceTypeAttr : SPV_StrEnumAttr< - "DeviceType", "valid SPIR-V device types", [ +def SPV_DeviceTypeAttr : SPV_EnumAttr< + "DeviceType", "valid SPIR-V device types", "device_type", [ SPV_DT_Other, SPV_DT_IntegratedGPU, SPV_DT_DiscreteGPU, SPV_DT_CPU, SPV_DT_Unknown ]>; -def SPV_V_AMD : StrEnumAttrCase<"AMD">; -def SPV_V_ARM : StrEnumAttrCase<"ARM">; -def SPV_V_Imagination : StrEnumAttrCase<"Imagination">; -def SPV_V_Intel : StrEnumAttrCase<"Intel">; -def SPV_V_NVIDIA : StrEnumAttrCase<"NVIDIA">; -def SPV_V_Qualcomm : StrEnumAttrCase<"Qualcomm">; -def SPV_V_SwiftShader : StrEnumAttrCase<"SwiftShader">; -def SPV_V_Unknown : StrEnumAttrCase<"Unknown">; - -def SPV_VendorAttr : SPV_StrEnumAttr< - "Vendor", "recognized SPIR-V vendor strings", [ +def SPV_V_AMD : I32EnumAttrCase<"AMD", 0>; +def SPV_V_ARM : I32EnumAttrCase<"ARM", 1>; +def SPV_V_Imagination : I32EnumAttrCase<"Imagination", 2>; +def SPV_V_Intel : I32EnumAttrCase<"Intel", 3>; +def SPV_V_NVIDIA : I32EnumAttrCase<"NVIDIA", 4>; +def SPV_V_Qualcomm : I32EnumAttrCase<"Qualcomm", 5>; +def SPV_V_SwiftShader : I32EnumAttrCase<"SwiftShader", 6>; +def SPV_V_Unknown : I32EnumAttrCase<"Unknown", 7>; + +def SPV_VendorAttr : SPV_EnumAttr< + "Vendor", "recognized SPIR-V vendor strings", "vendor", [ SPV_V_AMD, SPV_V_ARM, SPV_V_Imagination, SPV_V_Intel, SPV_V_NVIDIA, SPV_V_Qualcomm, SPV_V_SwiftShader, SPV_V_Unknown @@ -299,105 +298,105 @@ // Extensions known to the SPIR-V dialect. // https://github.com/KhronosGroup/SPIRV-Registry has the full list. 
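The forward declarations in SPIRVAttributes.h above gain an explicit `uint32_t` underlying type because the corresponding definitions are now generated from `I32EnumAttr`: an opaque declaration of a scoped enum must use the same underlying type as its definition. A standalone C++ illustration (the enumerator and its value are made up for the example):

```cpp
#include <cstdint>

// Opaque declaration: it must repeat the fixed underlying type, because
// `enum class Extension;` alone would default to `int` and conflict with
// the 32-bit definition emitted by tablegen.
enum class Extension : std::uint32_t;

// Definition with the matching underlying type.
enum class Extension : std::uint32_t { SPV_KHR_16bit_storage = 0 };

int main() { return static_cast<int>(Extension::SPV_KHR_16bit_storage); }
```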
-def SPV_KHR_16bit_storage : StrEnumAttrCase<"SPV_KHR_16bit_storage">; -def SPV_KHR_8bit_storage : StrEnumAttrCase<"SPV_KHR_8bit_storage">; -def SPV_KHR_device_group : StrEnumAttrCase<"SPV_KHR_device_group">; -def SPV_KHR_float_controls : StrEnumAttrCase<"SPV_KHR_float_controls">; -def SPV_KHR_physical_storage_buffer : StrEnumAttrCase<"SPV_KHR_physical_storage_buffer">; -def SPV_KHR_multiview : StrEnumAttrCase<"SPV_KHR_multiview">; -def SPV_KHR_no_integer_wrap_decoration : StrEnumAttrCase<"SPV_KHR_no_integer_wrap_decoration">; -def SPV_KHR_post_depth_coverage : StrEnumAttrCase<"SPV_KHR_post_depth_coverage">; -def SPV_KHR_shader_atomic_counter_ops : StrEnumAttrCase<"SPV_KHR_shader_atomic_counter_ops">; -def SPV_KHR_shader_ballot : StrEnumAttrCase<"SPV_KHR_shader_ballot">; -def SPV_KHR_shader_clock : StrEnumAttrCase<"SPV_KHR_shader_clock">; -def SPV_KHR_shader_draw_parameters : StrEnumAttrCase<"SPV_KHR_shader_draw_parameters">; -def SPV_KHR_storage_buffer_storage_class : StrEnumAttrCase<"SPV_KHR_storage_buffer_storage_class">; -def SPV_KHR_subgroup_vote : StrEnumAttrCase<"SPV_KHR_subgroup_vote">; -def SPV_KHR_variable_pointers : StrEnumAttrCase<"SPV_KHR_variable_pointers">; -def SPV_KHR_vulkan_memory_model : StrEnumAttrCase<"SPV_KHR_vulkan_memory_model">; -def SPV_KHR_expect_assume : StrEnumAttrCase<"SPV_KHR_expect_assume">; -def SPV_KHR_integer_dot_product : StrEnumAttrCase<"SPV_KHR_integer_dot_product">; -def SPV_KHR_bit_instructions : StrEnumAttrCase<"SPV_KHR_bit_instructions">; -def SPV_KHR_fragment_shading_rate : StrEnumAttrCase<"SPV_KHR_fragment_shading_rate">; -def SPV_KHR_workgroup_memory_explicit_layout : StrEnumAttrCase<"SPV_KHR_workgroup_memory_explicit_layout">; -def SPV_KHR_ray_query : StrEnumAttrCase<"SPV_KHR_ray_query">; -def SPV_KHR_ray_tracing : StrEnumAttrCase<"SPV_KHR_ray_tracing">; -def SPV_KHR_subgroup_uniform_control_flow : StrEnumAttrCase<"SPV_KHR_subgroup_uniform_control_flow">; -def SPV_KHR_linkonce_odr : StrEnumAttrCase<"SPV_KHR_linkonce_odr">; - -def SPV_EXT_demote_to_helper_invocation : StrEnumAttrCase<"SPV_EXT_demote_to_helper_invocation">; -def SPV_EXT_descriptor_indexing : StrEnumAttrCase<"SPV_EXT_descriptor_indexing">; -def SPV_EXT_fragment_fully_covered : StrEnumAttrCase<"SPV_EXT_fragment_fully_covered">; -def SPV_EXT_fragment_invocation_density : StrEnumAttrCase<"SPV_EXT_fragment_invocation_density">; -def SPV_EXT_fragment_shader_interlock : StrEnumAttrCase<"SPV_EXT_fragment_shader_interlock">; -def SPV_EXT_physical_storage_buffer : StrEnumAttrCase<"SPV_EXT_physical_storage_buffer">; -def SPV_EXT_shader_stencil_export : StrEnumAttrCase<"SPV_EXT_shader_stencil_export">; -def SPV_EXT_shader_viewport_index_layer : StrEnumAttrCase<"SPV_EXT_shader_viewport_index_layer">; -def SPV_EXT_shader_atomic_float_add : StrEnumAttrCase<"SPV_EXT_shader_atomic_float_add">; -def SPV_EXT_shader_atomic_float_min_max : StrEnumAttrCase<"SPV_EXT_shader_atomic_float_min_max">; -def SPV_EXT_shader_image_int64 : StrEnumAttrCase<"SPV_EXT_shader_image_int64">; -def SPV_EXT_shader_atomic_float16_add : StrEnumAttrCase<"SPV_EXT_shader_atomic_float16_add">; - -def SPV_AMD_gpu_shader_half_float_fetch : StrEnumAttrCase<"SPV_AMD_gpu_shader_half_float_fetch">; -def SPV_AMD_shader_ballot : StrEnumAttrCase<"SPV_AMD_shader_ballot">; -def SPV_AMD_shader_explicit_vertex_parameter : StrEnumAttrCase<"SPV_AMD_shader_explicit_vertex_parameter">; -def SPV_AMD_shader_fragment_mask : StrEnumAttrCase<"SPV_AMD_shader_fragment_mask">; -def SPV_AMD_shader_image_load_store_lod : 
StrEnumAttrCase<"SPV_AMD_shader_image_load_store_lod">; -def SPV_AMD_texture_gather_bias_lod : StrEnumAttrCase<"SPV_AMD_texture_gather_bias_lod">; - -def SPV_GOOGLE_decorate_string : StrEnumAttrCase<"SPV_GOOGLE_decorate_string">; -def SPV_GOOGLE_hlsl_functionality1 : StrEnumAttrCase<"SPV_GOOGLE_hlsl_functionality1">; -def SPV_GOOGLE_user_type : StrEnumAttrCase<"SPV_GOOGLE_user_type">; - -def SPV_INTEL_device_side_avc_motion_estimation : StrEnumAttrCase<"SPV_INTEL_device_side_avc_motion_estimation">; -def SPV_INTEL_media_block_io : StrEnumAttrCase<"SPV_INTEL_media_block_io">; -def SPV_INTEL_shader_integer_functions2 : StrEnumAttrCase<"SPV_INTEL_shader_integer_functions2">; -def SPV_INTEL_subgroups : StrEnumAttrCase<"SPV_INTEL_subgroups">; -def SPV_INTEL_float_controls2 : StrEnumAttrCase<"SPV_INTEL_float_controls2">; -def SPV_INTEL_function_pointers : StrEnumAttrCase<"SPV_INTEL_function_pointers">; -def SPV_INTEL_inline_assembly : StrEnumAttrCase<"SPV_INTEL_inline_assembly">; -def SPV_INTEL_vector_compute : StrEnumAttrCase<"SPV_INTEL_vector_compute">; -def SPV_INTEL_variable_length_array : StrEnumAttrCase<"SPV_INTEL_variable_length_array">; -def SPV_INTEL_fpga_memory_attributes : StrEnumAttrCase<"SPV_INTEL_fpga_memory_attributes">; -def SPV_INTEL_arbitrary_precision_integers : StrEnumAttrCase<"SPV_INTEL_arbitrary_precision_integers">; -def SPV_INTEL_arbitrary_precision_floating_point : StrEnumAttrCase<"SPV_INTEL_arbitrary_precision_floating_point">; -def SPV_INTEL_unstructured_loop_controls : StrEnumAttrCase<"SPV_INTEL_unstructured_loop_controls">; -def SPV_INTEL_fpga_loop_controls : StrEnumAttrCase<"SPV_INTEL_fpga_loop_controls">; -def SPV_INTEL_kernel_attributes : StrEnumAttrCase<"SPV_INTEL_kernel_attributes">; -def SPV_INTEL_fpga_memory_accesses : StrEnumAttrCase<"SPV_INTEL_fpga_memory_accesses">; -def SPV_INTEL_fpga_cluster_attributes : StrEnumAttrCase<"SPV_INTEL_fpga_cluster_attributes">; -def SPV_INTEL_loop_fuse : StrEnumAttrCase<"SPV_INTEL_loop_fuse">; -def SPV_INTEL_fpga_buffer_location : StrEnumAttrCase<"SPV_INTEL_fpga_buffer_location">; -def SPV_INTEL_arbitrary_precision_fixed_point : StrEnumAttrCase<"SPV_INTEL_arbitrary_precision_fixed_point">; -def SPV_INTEL_usm_storage_classes : StrEnumAttrCase<"SPV_INTEL_usm_storage_classes">; -def SPV_INTEL_io_pipes : StrEnumAttrCase<"SPV_INTEL_io_pipes">; -def SPV_INTEL_blocking_pipes : StrEnumAttrCase<"SPV_INTEL_blocking_pipes">; -def SPV_INTEL_fpga_reg : StrEnumAttrCase<"SPV_INTEL_fpga_reg">; -def SPV_INTEL_long_constant_composite : StrEnumAttrCase<"SPV_INTEL_long_constant_composite">; -def SPV_INTEL_optnone : StrEnumAttrCase<"SPV_INTEL_optnone">; -def SPV_INTEL_debug_module : StrEnumAttrCase<"SPV_INTEL_debug_module">; -def SPV_INTEL_fp_fast_math_mode : StrEnumAttrCase<"SPV_INTEL_fp_fast_math_mode">; - -def SPV_NV_compute_shader_derivatives : StrEnumAttrCase<"SPV_NV_compute_shader_derivatives">; -def SPV_NV_cooperative_matrix : StrEnumAttrCase<"SPV_NV_cooperative_matrix">; -def SPV_NV_fragment_shader_barycentric : StrEnumAttrCase<"SPV_NV_fragment_shader_barycentric">; -def SPV_NV_geometry_shader_passthrough : StrEnumAttrCase<"SPV_NV_geometry_shader_passthrough">; -def SPV_NV_mesh_shader : StrEnumAttrCase<"SPV_NV_mesh_shader">; -def SPV_NV_ray_tracing : StrEnumAttrCase<"SPV_NV_ray_tracing">; -def SPV_NV_sample_mask_override_coverage : StrEnumAttrCase<"SPV_NV_sample_mask_override_coverage">; -def SPV_NV_shader_image_footprint : StrEnumAttrCase<"SPV_NV_shader_image_footprint">; -def SPV_NV_shader_sm_builtins : 
StrEnumAttrCase<"SPV_NV_shader_sm_builtins">; -def SPV_NV_shader_subgroup_partitioned : StrEnumAttrCase<"SPV_NV_shader_subgroup_partitioned">; -def SPV_NV_shading_rate : StrEnumAttrCase<"SPV_NV_shading_rate">; -def SPV_NV_stereo_view_rendering : StrEnumAttrCase<"SPV_NV_stereo_view_rendering">; -def SPV_NV_viewport_array2 : StrEnumAttrCase<"SPV_NV_viewport_array2">; -def SPV_NV_bindless_texture : StrEnumAttrCase<"SPV_NV_bindless_texture">; -def SPV_NV_ray_tracing_motion_blur : StrEnumAttrCase<"SPV_NV_ray_tracing_motion_blur">; - -def SPV_NVX_multiview_per_view_attributes : StrEnumAttrCase<"SPV_NVX_multiview_per_view_attributes">; +def SPV_KHR_16bit_storage : I32EnumAttrCase<"SPV_KHR_16bit_storage", 0>; +def SPV_KHR_8bit_storage : I32EnumAttrCase<"SPV_KHR_8bit_storage", 1>; +def SPV_KHR_device_group : I32EnumAttrCase<"SPV_KHR_device_group", 2>; +def SPV_KHR_float_controls : I32EnumAttrCase<"SPV_KHR_float_controls", 3>; +def SPV_KHR_physical_storage_buffer : I32EnumAttrCase<"SPV_KHR_physical_storage_buffer", 4>; +def SPV_KHR_multiview : I32EnumAttrCase<"SPV_KHR_multiview", 5>; +def SPV_KHR_no_integer_wrap_decoration : I32EnumAttrCase<"SPV_KHR_no_integer_wrap_decoration", 6>; +def SPV_KHR_post_depth_coverage : I32EnumAttrCase<"SPV_KHR_post_depth_coverage", 7>; +def SPV_KHR_shader_atomic_counter_ops : I32EnumAttrCase<"SPV_KHR_shader_atomic_counter_ops", 8>; +def SPV_KHR_shader_ballot : I32EnumAttrCase<"SPV_KHR_shader_ballot", 9>; +def SPV_KHR_shader_clock : I32EnumAttrCase<"SPV_KHR_shader_clock", 10>; +def SPV_KHR_shader_draw_parameters : I32EnumAttrCase<"SPV_KHR_shader_draw_parameters", 11>; +def SPV_KHR_storage_buffer_storage_class : I32EnumAttrCase<"SPV_KHR_storage_buffer_storage_class", 12>; +def SPV_KHR_subgroup_vote : I32EnumAttrCase<"SPV_KHR_subgroup_vote", 13>; +def SPV_KHR_variable_pointers : I32EnumAttrCase<"SPV_KHR_variable_pointers", 14>; +def SPV_KHR_vulkan_memory_model : I32EnumAttrCase<"SPV_KHR_vulkan_memory_model", 15>; +def SPV_KHR_expect_assume : I32EnumAttrCase<"SPV_KHR_expect_assume", 16>; +def SPV_KHR_integer_dot_product : I32EnumAttrCase<"SPV_KHR_integer_dot_product", 17>; +def SPV_KHR_bit_instructions : I32EnumAttrCase<"SPV_KHR_bit_instructions", 18>; +def SPV_KHR_fragment_shading_rate : I32EnumAttrCase<"SPV_KHR_fragment_shading_rate", 19>; +def SPV_KHR_workgroup_memory_explicit_layout : I32EnumAttrCase<"SPV_KHR_workgroup_memory_explicit_layout", 20>; +def SPV_KHR_ray_query : I32EnumAttrCase<"SPV_KHR_ray_query", 21>; +def SPV_KHR_ray_tracing : I32EnumAttrCase<"SPV_KHR_ray_tracing", 22>; +def SPV_KHR_subgroup_uniform_control_flow : I32EnumAttrCase<"SPV_KHR_subgroup_uniform_control_flow", 23>; +def SPV_KHR_linkonce_odr : I32EnumAttrCase<"SPV_KHR_linkonce_odr", 24>; + +def SPV_EXT_demote_to_helper_invocation : I32EnumAttrCase<"SPV_EXT_demote_to_helper_invocation", 25>; +def SPV_EXT_descriptor_indexing : I32EnumAttrCase<"SPV_EXT_descriptor_indexing", 26>; +def SPV_EXT_fragment_fully_covered : I32EnumAttrCase<"SPV_EXT_fragment_fully_covered", 27>; +def SPV_EXT_fragment_invocation_density : I32EnumAttrCase<"SPV_EXT_fragment_invocation_density", 28>; +def SPV_EXT_fragment_shader_interlock : I32EnumAttrCase<"SPV_EXT_fragment_shader_interlock", 29>; +def SPV_EXT_physical_storage_buffer : I32EnumAttrCase<"SPV_EXT_physical_storage_buffer", 30>; +def SPV_EXT_shader_stencil_export : I32EnumAttrCase<"SPV_EXT_shader_stencil_export", 31>; +def SPV_EXT_shader_viewport_index_layer : I32EnumAttrCase<"SPV_EXT_shader_viewport_index_layer", 32>; +def SPV_EXT_shader_atomic_float_add : 
I32EnumAttrCase<"SPV_EXT_shader_atomic_float_add", 33>; +def SPV_EXT_shader_atomic_float_min_max : I32EnumAttrCase<"SPV_EXT_shader_atomic_float_min_max", 34>; +def SPV_EXT_shader_image_int64 : I32EnumAttrCase<"SPV_EXT_shader_image_int64", 35>; +def SPV_EXT_shader_atomic_float16_add : I32EnumAttrCase<"SPV_EXT_shader_atomic_float16_add", 36>; + +def SPV_AMD_gpu_shader_half_float_fetch : I32EnumAttrCase<"SPV_AMD_gpu_shader_half_float_fetch", 37>; +def SPV_AMD_shader_ballot : I32EnumAttrCase<"SPV_AMD_shader_ballot", 38>; +def SPV_AMD_shader_explicit_vertex_parameter : I32EnumAttrCase<"SPV_AMD_shader_explicit_vertex_parameter", 39>; +def SPV_AMD_shader_fragment_mask : I32EnumAttrCase<"SPV_AMD_shader_fragment_mask", 40>; +def SPV_AMD_shader_image_load_store_lod : I32EnumAttrCase<"SPV_AMD_shader_image_load_store_lod", 41>; +def SPV_AMD_texture_gather_bias_lod : I32EnumAttrCase<"SPV_AMD_texture_gather_bias_lod", 42>; + +def SPV_GOOGLE_decorate_string : I32EnumAttrCase<"SPV_GOOGLE_decorate_string", 43>; +def SPV_GOOGLE_hlsl_functionality1 : I32EnumAttrCase<"SPV_GOOGLE_hlsl_functionality1", 44>; +def SPV_GOOGLE_user_type : I32EnumAttrCase<"SPV_GOOGLE_user_type", 45>; + +def SPV_INTEL_device_side_avc_motion_estimation : I32EnumAttrCase<"SPV_INTEL_device_side_avc_motion_estimation", 46>; +def SPV_INTEL_media_block_io : I32EnumAttrCase<"SPV_INTEL_media_block_io", 47>; +def SPV_INTEL_shader_integer_functions2 : I32EnumAttrCase<"SPV_INTEL_shader_integer_functions2", 48>; +def SPV_INTEL_subgroups : I32EnumAttrCase<"SPV_INTEL_subgroups", 49>; +def SPV_INTEL_float_controls2 : I32EnumAttrCase<"SPV_INTEL_float_controls2", 50>; +def SPV_INTEL_function_pointers : I32EnumAttrCase<"SPV_INTEL_function_pointers", 51>; +def SPV_INTEL_inline_assembly : I32EnumAttrCase<"SPV_INTEL_inline_assembly", 52>; +def SPV_INTEL_vector_compute : I32EnumAttrCase<"SPV_INTEL_vector_compute", 53>; +def SPV_INTEL_variable_length_array : I32EnumAttrCase<"SPV_INTEL_variable_length_array", 54>; +def SPV_INTEL_fpga_memory_attributes : I32EnumAttrCase<"SPV_INTEL_fpga_memory_attributes", 55>; +def SPV_INTEL_arbitrary_precision_integers : I32EnumAttrCase<"SPV_INTEL_arbitrary_precision_integers", 56>; +def SPV_INTEL_arbitrary_precision_floating_point : I32EnumAttrCase<"SPV_INTEL_arbitrary_precision_floating_point", 57>; +def SPV_INTEL_unstructured_loop_controls : I32EnumAttrCase<"SPV_INTEL_unstructured_loop_controls", 58>; +def SPV_INTEL_fpga_loop_controls : I32EnumAttrCase<"SPV_INTEL_fpga_loop_controls", 59>; +def SPV_INTEL_kernel_attributes : I32EnumAttrCase<"SPV_INTEL_kernel_attributes", 60>; +def SPV_INTEL_fpga_memory_accesses : I32EnumAttrCase<"SPV_INTEL_fpga_memory_accesses", 61>; +def SPV_INTEL_fpga_cluster_attributes : I32EnumAttrCase<"SPV_INTEL_fpga_cluster_attributes", 62>; +def SPV_INTEL_loop_fuse : I32EnumAttrCase<"SPV_INTEL_loop_fuse", 63>; +def SPV_INTEL_fpga_buffer_location : I32EnumAttrCase<"SPV_INTEL_fpga_buffer_location", 64>; +def SPV_INTEL_arbitrary_precision_fixed_point : I32EnumAttrCase<"SPV_INTEL_arbitrary_precision_fixed_point", 65>; +def SPV_INTEL_usm_storage_classes : I32EnumAttrCase<"SPV_INTEL_usm_storage_classes", 66>; +def SPV_INTEL_io_pipes : I32EnumAttrCase<"SPV_INTEL_io_pipes", 67>; +def SPV_INTEL_blocking_pipes : I32EnumAttrCase<"SPV_INTEL_blocking_pipes", 68>; +def SPV_INTEL_fpga_reg : I32EnumAttrCase<"SPV_INTEL_fpga_reg", 69>; +def SPV_INTEL_long_constant_composite : I32EnumAttrCase<"SPV_INTEL_long_constant_composite", 70>; +def SPV_INTEL_optnone : I32EnumAttrCase<"SPV_INTEL_optnone", 71>; +def 
SPV_INTEL_debug_module : I32EnumAttrCase<"SPV_INTEL_debug_module", 72>; +def SPV_INTEL_fp_fast_math_mode : I32EnumAttrCase<"SPV_INTEL_fp_fast_math_mode", 73>; + +def SPV_NV_compute_shader_derivatives : I32EnumAttrCase<"SPV_NV_compute_shader_derivatives", 74>; +def SPV_NV_cooperative_matrix : I32EnumAttrCase<"SPV_NV_cooperative_matrix", 75>; +def SPV_NV_fragment_shader_barycentric : I32EnumAttrCase<"SPV_NV_fragment_shader_barycentric", 76>; +def SPV_NV_geometry_shader_passthrough : I32EnumAttrCase<"SPV_NV_geometry_shader_passthrough", 77>; +def SPV_NV_mesh_shader : I32EnumAttrCase<"SPV_NV_mesh_shader", 78>; +def SPV_NV_ray_tracing : I32EnumAttrCase<"SPV_NV_ray_tracing", 79>; +def SPV_NV_sample_mask_override_coverage : I32EnumAttrCase<"SPV_NV_sample_mask_override_coverage", 80>; +def SPV_NV_shader_image_footprint : I32EnumAttrCase<"SPV_NV_shader_image_footprint", 81>; +def SPV_NV_shader_sm_builtins : I32EnumAttrCase<"SPV_NV_shader_sm_builtins", 82>; +def SPV_NV_shader_subgroup_partitioned : I32EnumAttrCase<"SPV_NV_shader_subgroup_partitioned", 83>; +def SPV_NV_shading_rate : I32EnumAttrCase<"SPV_NV_shading_rate", 84>; +def SPV_NV_stereo_view_rendering : I32EnumAttrCase<"SPV_NV_stereo_view_rendering", 85>; +def SPV_NV_viewport_array2 : I32EnumAttrCase<"SPV_NV_viewport_array2", 86>; +def SPV_NV_bindless_texture : I32EnumAttrCase<"SPV_NV_bindless_texture", 87>; +def SPV_NV_ray_tracing_motion_blur : I32EnumAttrCase<"SPV_NV_ray_tracing_motion_blur", 88>; + +def SPV_NVX_multiview_per_view_attributes : I32EnumAttrCase<"SPV_NVX_multiview_per_view_attributes", 89>; def SPV_ExtensionAttr : - SPV_StrEnumAttr<"Extension", "supported SPIR-V extensions", [ + SPV_EnumAttr<"Extension", "supported SPIR-V extensions", "ext", [ SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_device_group, SPV_KHR_float_controls, SPV_KHR_physical_storage_buffer, SPV_KHR_multiview, SPV_KHR_no_integer_wrap_decoration, SPV_KHR_post_depth_coverage, diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVEnums.h b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVEnums.h --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVEnums.h +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVEnums.h @@ -24,7 +24,7 @@ namespace mlir { namespace spirv { enum class Version : uint32_t; -enum class Extension; +enum class Extension : uint32_t; enum class Capability : uint32_t; } // namespace spirv } // namespace mlir diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h --- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h +++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h @@ -23,17 +23,8 @@ template struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern { private: - enum Dimension { X = 0, Y = 1, Z = 2, invalid }; unsigned indexBitwidth; - static Dimension dimensionToIndex(Op op) { - return StringSwitch(op.dimension()) - .Case("x", X) - .Case("y", Y) - .Case("z", Z) - .Default(invalid); - } - public: explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter) : ConvertOpToLLVMPattern(typeConverter), @@ -46,18 +37,16 @@ auto loc = op->getLoc(); MLIRContext *context = rewriter.getContext(); Value newOp; - switch (dimensionToIndex(op)) { - case X: + switch (op.dimension()) { + case gpu::Dimension::x: newOp = rewriter.create(loc, IntegerType::get(context, 32)); break; - case Y: + case gpu::Dimension::y: newOp = rewriter.create(loc, IntegerType::get(context, 32)); break; - case Z: + case gpu::Dimension::z: newOp = rewriter.create(loc, 
IntegerType::get(context, 32)); break; - default: - return failure(); } if (indexBitwidth > 32) { diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -40,15 +40,15 @@ namespace { /// Convert gpu dialect shfl mode enum to the equivalent nvvm one. -static NVVM::ShflKind convertShflKind(gpu::ShuffleModeAttr mode) { +static NVVM::ShflKind convertShflKind(gpu::ShuffleMode mode) { switch (mode) { - case gpu::ShuffleModeAttr::XOR: + case gpu::ShuffleMode::XOR: return NVVM::ShflKind::bfly; - case gpu::ShuffleModeAttr::UP: + case gpu::ShuffleMode::UP: return NVVM::ShflKind::up; - case gpu::ShuffleModeAttr::DOWN: + case gpu::ShuffleMode::DOWN: return NVVM::ShflKind::down; - case gpu::ShuffleModeAttr::IDX: + case gpu::ShuffleMode::IDX: return NVVM::ShflKind::idx; } llvm_unreachable("unknown shuffle mode"); diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp --- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp @@ -115,26 +115,10 @@ // Builtins. //===----------------------------------------------------------------------===// -static Optional getLaunchConfigIndex(Operation *op) { - auto dimAttr = op->getAttrOfType("dimension"); - if (!dimAttr) - return llvm::None; - - return llvm::StringSwitch>(dimAttr.getValue()) - .Case("x", 0) - .Case("y", 1) - .Case("z", 2) - .Default(llvm::None); -} - template LogicalResult LaunchConfigConversion::matchAndRewrite( SourceOp op, typename SourceOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const { - auto index = getLaunchConfigIndex(op); - if (!index) - return failure(); - auto *typeConverter = this->template getTypeConverter(); auto indexType = typeConverter->getIndexType(); @@ -143,7 +127,7 @@ spirv::getBuiltinVariableValue(op, builtin, indexType, rewriter); rewriter.replaceOpWithNewOp( op, indexType, spirvBuiltin, - rewriter.getI32ArrayAttr({index.getValue()})); + rewriter.getI32ArrayAttr({static_cast(op.dimension())})); return success(); } @@ -164,12 +148,9 @@ LogicalResult WorkGroupSizeConversion::matchAndRewrite( gpu::BlockDimOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { - auto index = getLaunchConfigIndex(op); - if (!index) - return failure(); - auto workGroupSizeAttr = spirv::lookupLocalWorkGroupSize(op); - auto val = workGroupSizeAttr.getValues()[index.getValue()]; + auto val = workGroupSizeAttr + .getValues()[static_cast(op.dimension())]; auto convertedType = getTypeConverter()->convertType(op.getResult().getType()); if (!convertedType) diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -27,6 +27,7 @@ DEPENDS MLIRGPUOpsIncGen + MLIRGPUOpsAttributesIncGen MLIRGPUOpsEnumsGen MLIRGPUOpInterfacesIncGen diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -124,6 +124,10 @@ #define GET_OP_LIST #include "mlir/Dialect/GPU/GPUOps.cpp.inc" >(); + addAttributes< +#define GET_ATTRDEF_LIST +#include "mlir/Dialect/GPU/GPUOpsAttributes.cpp.inc" + >(); addInterfaces(); } @@ -293,54 +297,36 @@ if (yieldCount == 0) return allReduce.emitError("expected gpu.yield op in region"); } else { - 
StringRef opName = *allReduce.op(); - if ((opName == "and" || opName == "or" || opName == "xor") && + gpu::AllReduceOperation opName = *allReduce.op(); + if ((opName == gpu::AllReduceOperation::AND || + opName == gpu::AllReduceOperation::OR || + opName == gpu::AllReduceOperation::XOR) && !allReduce.getType().isa()) { return allReduce.emitError() - << '`' << opName << '`' + << '`' << gpu::stringifyAllReduceOperation(opName) << '`' << " accumulator is only compatible with Integer type"; } } return success(); } -static LogicalResult verifyShuffleOp(gpu::ShuffleOp shuffleOp) { - auto type = shuffleOp.value().getType(); - if (shuffleOp.result().getType() != type) { - return shuffleOp.emitOpError() - << "requires the same type for value operand and result"; - } - if (!type.isSignlessIntOrFloat() || type.getIntOrFloatBitWidth() != 32) { - return shuffleOp.emitOpError() - << "requires value operand type to be f32 or i32"; +// TODO: Support optional custom attributes (without dialect prefix). +static ParseResult parseAllReduceOperation(AsmParser &parser, + AllReduceOperationAttr &attr) { + StringRef enumStr; + if (!parser.parseOptionalKeyword(&enumStr)) { + Optional op = gpu::symbolizeAllReduceOperation(enumStr); + if (!op) + return parser.emitError(parser.getCurrentLocation(), "invalid op kind"); + attr = AllReduceOperationAttr::get(parser.getContext(), *op); } return success(); } -static void printShuffleOp(OpAsmPrinter &p, ShuffleOp op) { - p << ' ' << op.getOperands() << ' ' << stringifyEnum(op.mode()) << " : " - << op.value().getType(); -} - -static ParseResult parseShuffleOp(OpAsmParser &parser, OperationState &state) { - SmallVector operandInfo; - if (parser.parseOperandList(operandInfo, 3)) - return failure(); - - StringRef mode; - if (parser.parseKeyword(&mode)) - return failure(); - state.addAttribute("mode", parser.getBuilder().getStringAttr(mode)); - - Type valueType; - Type int32Type = parser.getBuilder().getIntegerType(32); - Type int1Type = parser.getBuilder().getI1Type(); - if (parser.parseColonType(valueType) || - parser.resolveOperands(operandInfo, {valueType, int32Type, int32Type}, - parser.getCurrentLocation(), state.operands) || - parser.addTypesToList({valueType, int1Type}, state.types)) - return failure(); - return success(); +static void printAllReduceOperation(AsmPrinter &printer, Operation *op, + AllReduceOperationAttr attr) { + if (attr) + attr.print(printer); } //===----------------------------------------------------------------------===// @@ -1203,5 +1189,8 @@ #include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc" #include "mlir/Dialect/GPU/GPUOpsEnums.cpp.inc" +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/GPU/GPUOpsAttributes.cpp.inc" + #define GET_OP_CLASSES #include "mlir/Dialect/GPU/GPUOps.cpp.inc" diff --git a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp --- a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp @@ -66,12 +66,12 @@ rewriter.setInsertionPoint(reduceOp); // Compute linear invocation index and workgroup size. 
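  // The arithmetic that follows linearizes the 3-D thread id in row-major
  // order, roughly invocationIdx = (tidZ * dimY + tidY) * dimX + tidX, with
  // the workgroup size being dimX * dimY * dimZ; this patch only changes the
  // dimension operands from string attributes to the gpu::Dimension enum.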
- Value dimX = getDimOp("x"); - Value dimY = getDimOp("y"); - Value dimZ = getDimOp("z"); - Value tidX = getDimOp("x"); - Value tidY = getDimOp("y"); - Value tidZ = getDimOp("z"); + Value dimX = getDimOp(gpu::Dimension::x); + Value dimY = getDimOp(gpu::Dimension::y); + Value dimZ = getDimOp(gpu::Dimension::z); + Value tidX = getDimOp(gpu::Dimension::x); + Value tidY = getDimOp(gpu::Dimension::y); + Value tidZ = getDimOp(gpu::Dimension::z); Value tmp1 = create(int32Type, tidZ, dimY); Value tmp2 = create(int32Type, tmp1, tidY); Value tmp3 = create(int32Type, tmp2, dimX); @@ -150,8 +150,8 @@ // Creates dimension op of type T, with the result casted to int32. template - Value getDimOp(StringRef dimension) { - Value dim = create(indexType, rewriter.getStringAttr(dimension)); + Value getDimOp(gpu::Dimension dimension) { + Value dim = create(indexType, dimension); return create(int32Type, dim); } @@ -212,38 +212,34 @@ } /// Returns an accumulator factory that creates an op specified by opName. - AccumulatorFactory getFactory(StringRef opName) { + AccumulatorFactory getFactory(gpu::AllReduceOperation opName) { bool isFloatingPoint = valueType.isa(); - if (opName == "add") + switch (opName) { + case gpu::AllReduceOperation::ADD: return isFloatingPoint ? getFactory() : getFactory(); - if (opName == "mul") + case gpu::AllReduceOperation::MUL: return isFloatingPoint ? getFactory() : getFactory(); - if (opName == "and") { + case gpu::AllReduceOperation::AND: return getFactory(); - } - if (opName == "or") { + case gpu::AllReduceOperation::OR: return getFactory(); - } - if (opName == "xor") { + case gpu::AllReduceOperation::XOR: return getFactory(); - } - if (opName == "max") { + case gpu::AllReduceOperation::MAX: return isFloatingPoint ? getCmpFactory() : getCmpFactory(); - } - if (opName == "min") { + case gpu::AllReduceOperation::MIN: return isFloatingPoint ? getCmpFactory() : getCmpFactory(); } - return AccumulatorFactory(); } /// Returns an accumulator factory that creates an op of type T. @@ -328,7 +324,6 @@ Value isPartialSubgroup = create(arith::CmpIPredicate::slt, activeWidth, subgroupSize); std::array shuffleType = {valueType, rewriter.getI1Type()}; - auto xorAttr = rewriter.getStringAttr("xor"); createIf( isPartialSubgroup, @@ -340,8 +335,8 @@ // in the first lane. for (int i = 1; i < kSubgroupSize; i <<= 1) { Value offset = create(i, int32Type); - auto shuffleOp = create(shuffleType, value, offset, - activeWidth, xorAttr); + auto shuffleOp = create( + shuffleType, value, offset, activeWidth, gpu::ShuffleMode::XOR); // Skip the accumulation if the shuffle op read from a lane outside // of the active range. 
createIf( @@ -362,8 +357,9 @@ Value value = operand; for (int i = 1; i < kSubgroupSize; i <<= 1) { Value offset = create(i, int32Type); - auto shuffleOp = create(shuffleType, value, offset, - subgroupSize, xorAttr); + auto shuffleOp = + create(shuffleType, value, offset, subgroupSize, + gpu::ShuffleMode::XOR); value = accumFactory(value, shuffleOp.getResult(0)); } return SmallVector{value}; diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -30,11 +30,8 @@ template static void createForAllDimensions(OpBuilder &builder, Location loc, SmallVectorImpl &values) { - for (StringRef dim : {"x", "y", "z"}) { - Value v = builder.create(loc, builder.getIndexType(), - builder.getStringAttr(dim)); - values.push_back(v); - } + for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) + values.push_back(builder.create(loc, builder.getIndexType(), dim)); } /// Adds operations generating block/thread ids and grid/block dimensions at the diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp --- a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp @@ -24,18 +24,6 @@ using namespace mlir; using namespace mlir::gpu; -/// Returns the textual name of a GPU dimension. -static StringRef getDimName(unsigned dim) { - if (dim == 0) - return "x"; - if (dim == 1) - return "y"; - if (dim == 2) - return "z"; - - llvm_unreachable("dimension ID overflow"); -} - /// Emits the (imperfect) loop nest performing the copy between "from" and "to" /// values using the bounds derived from the "from" value. Emits at least /// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with @@ -71,10 +59,9 @@ // Obtain thread identifiers and block sizes, necessary to map to them. auto indexType = b.getIndexType(); SmallVector threadIds, blockDims; - for (unsigned i = 0; i < 3; ++i) { - auto dimName = b.getStringAttr(getDimName(i)); - threadIds.push_back(b.create(indexType, dimName)); - blockDims.push_back(b.create(indexType, dimName)); + for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) { + threadIds.push_back(b.create(indexType, dim)); + blockDims.push_back(b.create(indexType, dim)); } // Produce the loop nest with copies. diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -18,9 +18,11 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/DialectImplementation.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" #include "mlir/IR/OperationSupport.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -213,6 +215,10 @@ #define GET_OP_LIST #include "mlir/Dialect/LLVMIR/NVVMOps.cpp.inc" >(); + addAttributes< +#define GET_ATTRDEF_LIST +#include "mlir/Dialect/LLVMIR/NVVMOpsAttributes.cpp.inc" + >(); // Support unknown operations because not all NVVM operations are // registered. 
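  // GET_ATTRDEF_LIST expands to the comma-separated list of attribute classes
  // generated from the dialect's ODS definitions, so the addAttributes<...>()
  // call above registers every tablegen-defined attribute in one place; the
  // GPU, OpenACC and OpenMP dialects below follow the same pattern.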
@@ -233,3 +239,6 @@ #define GET_OP_CLASSES #include "mlir/Dialect/LLVMIR/NVVMOps.cpp.inc" + +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/LLVMIR/NVVMOpsAttributes.cpp.inc" diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -12,8 +12,10 @@ #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/DialectImplementation.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/TypeSwitch.h" using namespace mlir; using namespace acc; @@ -29,6 +31,10 @@ #define GET_OP_LIST #include "mlir/Dialect/OpenACC/OpenACCOps.cpp.inc" >(); + addAttributes< +#define GET_ATTRDEF_LIST +#include "mlir/Dialect/OpenACC/OpenACCOpsAttributes.cpp.inc" + >(); } template @@ -894,3 +900,6 @@ #define GET_OP_CLASSES #include "mlir/Dialect/OpenACC/OpenACCOps.cpp.inc" + +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/OpenACC/OpenACCOpsAttributes.cpp.inc" diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Attributes.h" +#include "mlir/IR/DialectImplementation.h" #include "mlir/IR/OpImplementation.h" #include "mlir/IR/OperationSupport.h" @@ -22,6 +23,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/TypeSwitch.h" #include #include "mlir/Dialect/OpenMP/OpenMPOpsDialect.cpp.inc" @@ -47,6 +49,10 @@ #define GET_OP_LIST #include "mlir/Dialect/OpenMP/OpenMPOps.cpp.inc" >(); + addAttributes< +#define GET_ATTRDEF_LIST +#include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc" + >(); LLVM::LLVMPointerType::attachInterface< PointerLikeModel>(*getContext()); @@ -184,10 +190,10 @@ printAllocateAndAllocator(p, op.allocate_vars(), op.allocators_vars()); if (auto def = op.default_val()) - p << "default(" << def->drop_front(3) << ") "; + p << "default(" << stringifyClauseDefault(*def).drop_front(3) << ") "; if (auto bind = op.proc_bind_val()) - p << "proc_bind(" << bind << ") "; + p << "proc_bind(" << stringifyClauseProcBindKind(*bind) << ") "; p.printRegion(op.getRegion()); } @@ -261,19 +267,15 @@ // If we have one modifier that is "simd", then stick a "none" modiifer in // index 0. 
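  // For example, an input such as `schedule(static, simd)` reaches this point
  // with the single modifier list ["simd"]; the normalization below rewrites
  // it to ["none", "simd"] so that the simd modifier always occupies the
  // second slot.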
if (modifiers.size() == 1) { - if (symbolizeScheduleModifier(modifiers[0]) == - mlir::omp::ScheduleModifier::simd) { + if (symbolizeScheduleModifier(modifiers[0]) == ScheduleModifier::simd) { modifiers.push_back(modifiers[0]); - modifiers[0] = - stringifyScheduleModifier(mlir::omp::ScheduleModifier::none); + modifiers[0] = stringifyScheduleModifier(ScheduleModifier::none); } } else if (modifiers.size() == 2) { // If there are two modifier: // First modifier should not be simd, second one should be simd - if (symbolizeScheduleModifier(modifiers[0]) == - mlir::omp::ScheduleModifier::simd || - symbolizeScheduleModifier(modifiers[1]) != - mlir::omp::ScheduleModifier::simd) + if (symbolizeScheduleModifier(modifiers[0]) == ScheduleModifier::simd || + symbolizeScheduleModifier(modifiers[1]) != ScheduleModifier::simd) return parser.emitError(parser.getNameLoc()) << " incorrect modifier order"; } @@ -333,15 +335,14 @@ } /// Print schedule clause -static void printScheduleClause(OpAsmPrinter &p, StringRef &sched, - llvm::Optional modifier, bool simd, +static void printScheduleClause(OpAsmPrinter &p, ClauseScheduleKind sched, + Optional modifier, bool simd, Value scheduleChunkVar) { - std::string schedLower = sched.lower(); - p << "schedule(" << schedLower; + p << "schedule(" << stringifyClauseScheduleKind(sched).lower(); if (scheduleChunkVar) p << " = " << scheduleChunkVar; - if (modifier && modifier.hasValue()) - p << ", " << modifier; + if (modifier) + p << ", " << stringifyScheduleModifier(*modifier); if (simd) p << ", simd"; p << ") "; @@ -545,6 +546,24 @@ // Parser for Clause List //===----------------------------------------------------------------------===// +/// Parse a clause attribute `(` $value `)`. +template +ParseResult parseClauseAttr(AsmParser &parser, OperationState &state, + StringRef attrName, StringRef name) { + using ClauseT = decltype(std::declval().getValue()); + StringRef enumStr; + llvm::SMLoc loc = parser.getCurrentLocation(); + if (parser.parseLParen() || parser.parseKeyword(&enumStr) || + parser.parseRParen()) + return failure(); + if (Optional enumValue = symbolizeEnum(enumStr)) { + auto attr = ClauseAttr::get(parser.getContext(), *enumValue); + state.addAttribute(attrName, attr); + return success(); + } + return parser.emitError(loc, "invalid ") << name << " kind"; +} + /// Parse a list of clauses. The clauses can appear in any order, but their /// operand segment indices are in the same order that they are passed in the /// `clauses` list. 
The operand segments are added over the prevSegments @@ -634,13 +653,12 @@ SmallVector clauseSegments(currPos); // Helper function to check if a clause is allowed/repeated or not - auto checkAllowed = [&](ClauseType clause, - bool allowRepeat = false) -> ParseResult { + auto checkAllowed = [&](ClauseType clause) -> ParseResult { if (!llvm::is_contained(clauses, clause)) return parser.emitError(parser.getCurrentLocation()) << clauseKeyword << " is not a valid clause for the " << opName << " operation"; - if (done[clause] && !allowRepeat) + if (done[clause]) return parser.emitError(parser.getCurrentLocation()) << "at most one " << clauseKeyword << " clause can appear on the " << opName << " operation"; @@ -695,20 +713,24 @@ clauseSegments[pos[allocateClause] + 1] = allocators.size(); } else if (clauseKeyword == "default") { StringRef defval; + llvm::SMLoc loc = parser.getCurrentLocation(); if (checkAllowed(defaultClause) || parser.parseLParen() || parser.parseKeyword(&defval) || parser.parseRParen()) return failure(); // The def prefix is required for the attribute as "private" is a keyword // in C++. - auto attr = parser.getBuilder().getStringAttr("def" + defval); - result.addAttribute("default_val", attr); + if (Optional def = + symbolizeClauseDefault(("def" + defval).str())) { + result.addAttribute("default_val", + ClauseDefaultAttr::get(parser.getContext(), *def)); + } else { + return parser.emitError(loc, "invalid default clause"); + } } else if (clauseKeyword == "proc_bind") { - StringRef bind; - if (checkAllowed(procBindClause) || parser.parseLParen() || - parser.parseKeyword(&bind) || parser.parseRParen()) + if (checkAllowed(procBindClause) || + parseClauseAttr(parser, result, + "proc_bind_val", "proc bind")) return failure(); - auto attr = parser.getBuilder().getStringAttr(bind); - result.addAttribute("proc_bind_val", attr); } else if (clauseKeyword == "reduction") { if (checkAllowed(reductionClause) || parseReductionVarList(parser, reductionSymbols, reductionVars, @@ -754,19 +776,15 @@ } result.addAttribute("ordered_val", attr); } else if (clauseKeyword == "order") { - StringRef order; - if (checkAllowed(orderClause) || parser.parseLParen() || - parser.parseKeyword(&order) || parser.parseRParen()) + if (checkAllowed(orderClause) || + parseClauseAttr(parser, result, "order_val", + "order")) return failure(); - auto attr = parser.getBuilder().getStringAttr(order); - result.addAttribute("order_val", attr); } else if (clauseKeyword == "memory_order") { - StringRef memoryOrder; - if (checkAllowed(memoryOrderClause) || parser.parseLParen() || - parser.parseKeyword(&memoryOrder) || parser.parseRParen()) + if (checkAllowed(memoryOrderClause) || + parseClauseAttr( + parser, result, "memory_order", "memory order")) return failure(); - result.addAttribute("memory_order", - parser.getBuilder().getStringAttr(memoryOrder)); } else if (clauseKeyword == "hint") { IntegerAttr hint; if (checkAllowed(hintClause) || @@ -861,15 +879,28 @@ // Add schedule parameters if (done[scheduleClause] && !schedule.empty()) { schedule[0] = llvm::toUpper(schedule[0]); - auto attr = parser.getBuilder().getStringAttr(schedule); - result.addAttribute("schedule_val", attr); + if (Optional sched = + symbolizeClauseScheduleKind(schedule)) { + auto attr = ClauseScheduleKindAttr::get(parser.getContext(), *sched); + result.addAttribute("schedule_val", attr); + } else { + return parser.emitError(parser.getCurrentLocation(), + "invalid schedule kind"); + } if (!modifiers.empty()) { - auto mod = 
parser.getBuilder().getStringAttr(modifiers[0]); - result.addAttribute("schedule_modifier", mod); + llvm::SMLoc loc = parser.getCurrentLocation(); + if (Optional mod = + symbolizeScheduleModifier(modifiers[0])) { + result.addAttribute( + "schedule_modifier", + ScheduleModifierAttr::get(parser.getContext(), *mod)); + } else { + return parser.emitError(loc, "invalid schedule modifier"); + } // Only SIMD attribute is allowed here! if (modifiers.size() > 1) { assert(symbolizeScheduleModifier(modifiers[1]) == - mlir::omp::ScheduleModifier::simd); + ScheduleModifier::simd); auto attr = UnitAttr::get(parser.getBuilder().getContext()); result.addAttribute("simd_modifier", attr); } @@ -1089,7 +1120,7 @@ p << "ordered(" << ordered << ") "; if (auto order = op.order_val()) - p << "order(" << order << ") "; + p << "order(" << stringifyClauseOrderKind(*order) << ") "; if (!op.reduction_vars().empty()) printReductionVarList(p, op.reductions(), op.reduction_vars()); @@ -1351,8 +1382,8 @@ /// Printer for AtomicReadOp static void printAtomicReadOp(OpAsmPrinter &p, AtomicReadOp op) { p << " " << op.v() << " = " << op.x() << " "; - if (op.memory_order()) - p << "memory_order(" << op.memory_order().getValue() << ") "; + if (auto mo = op.memory_order()) + p << "memory_order(" << stringifyClauseMemoryOrderKind(*mo) << ") "; if (op.hintAttr()) printSynchronizationHint(p << " ", op, op.hintAttr()); p << ": " << op.x().getType(); @@ -1360,11 +1391,12 @@ /// Verifier for AtomicReadOp static LogicalResult verifyAtomicReadOp(AtomicReadOp op) { - if (op.memory_order()) { - StringRef memOrder = op.memory_order().getValue(); - if (memOrder.equals("acq_rel") || memOrder.equals("release")) + if (auto mo = op.memory_order()) { + if (*mo == ClauseMemoryOrderKind::acq_rel || + *mo == ClauseMemoryOrderKind::release) { return op.emitError( "memory-order must not be acq_rel or release for atomic reads"); + } } if (op.x() == op.v()) return op.emitError( @@ -1403,8 +1435,8 @@ /// Printer for AtomicWriteOp static void printAtomicWriteOp(OpAsmPrinter &p, AtomicWriteOp op) { p << " " << op.address() << " = " << op.value() << " "; - if (op.memory_order()) - p << "memory_order(" << op.memory_order() << ") "; + if (auto mo = op.memory_order()) + p << "memory_order(" << stringifyClauseMemoryOrderKind(*mo) << ") "; if (op.hintAttr()) printSynchronizationHint(p, op, op.hintAttr()); p << ": " << op.address().getType() << ", " << op.value().getType(); @@ -1412,11 +1444,12 @@ /// Verifier for AtomicWriteOp static LogicalResult verifyAtomicWriteOp(AtomicWriteOp op) { - if (op.memory_order()) { - StringRef memoryOrder = op.memory_order().getValue(); - if (memoryOrder.equals("acq_rel") || memoryOrder.equals("acquire")) + if (auto mo = op.memory_order()) { + if (*mo == ClauseMemoryOrderKind::acq_rel || + *mo == ClauseMemoryOrderKind::acquire) { return op.emitError( "memory-order must not be acq_rel or acquire for atomic writes"); + } } return verifySynchronizationHint(op, op.hint()); } @@ -1482,8 +1515,8 @@ } p << y << " " << AtomicBinOpKindToString(op.binop()).lower() << " " << z << " "; - if (op.memory_order()) - p << "memory_order(" << op.memory_order() << ") "; + if (auto mo = op.memory_order()) + p << "memory_order(" << stringifyClauseMemoryOrderKind(*mo) << ") "; if (op.hintAttr()) printSynchronizationHint(p, op, op.hintAttr()); p << ": " << op.x().getType() << ", " << op.expr().getType(); @@ -1491,14 +1524,18 @@ /// Verifier for AtomicUpdateOp static LogicalResult verifyAtomicUpdateOp(AtomicUpdateOp op) { - if (op.memory_order()) { - 
StringRef memoryOrder = op.memory_order().getValue(); - if (memoryOrder.equals("acq_rel") || memoryOrder.equals("acquire")) + if (auto mo = op.memory_order()) { + if (*mo == ClauseMemoryOrderKind::acq_rel || + *mo == ClauseMemoryOrderKind::acquire) { return op.emitError( "memory-order must not be acq_rel or acquire for atomic updates"); + } } return success(); } +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc" + #define GET_OP_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOps.cpp.inc" diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -186,9 +186,23 @@ moduleTranslation.forgetMapping(region); } +// Convert ProcBindKinf from MLIR-generated enum to LLVM enum. +static llvm::omp::ProcBindKind getProcBindKind(omp::ClauseProcBindKind kind) { + switch (kind) { + case omp::ClauseProcBindKind::close: + return llvm::omp::ProcBindKind::OMP_PROC_BIND_close; + case omp::ClauseProcBindKind::master: + return llvm::omp::ProcBindKind::OMP_PROC_BIND_master; + case omp::ClauseProcBindKind::primary: + return llvm::omp::ProcBindKind::OMP_PROC_BIND_primary; + case omp::ClauseProcBindKind::spread: + return llvm::omp::ProcBindKind::OMP_PROC_BIND_spread; + } +} + /// Converts the OpenMP parallel operation to LLVM IR. static LogicalResult -convertOmpParallel(Operation &opInst, llvm::IRBuilderBase &builder, +convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; // TODO: support error propagation in OpenMPIRBuilder and use it instead of @@ -203,10 +217,9 @@ moduleTranslation, allocaIP); // ParallelOp has only one region associated with it. - auto ®ion = cast(opInst).getRegion(); - convertOmpOpRegions(region, "omp.par.region", *codeGenIP.getBlock(), - continuationBlock, builder, moduleTranslation, - bodyGenStatus); + convertOmpOpRegions(opInst.getRegion(), "omp.par.region", + *codeGenIP.getBlock(), continuationBlock, builder, + moduleTranslation, bodyGenStatus); }; // TODO: Perform appropriate actions according to the data-sharing @@ -225,14 +238,14 @@ auto finiCB = [&](InsertPointTy codeGenIP) {}; llvm::Value *ifCond = nullptr; - if (auto ifExprVar = cast(opInst).if_expr_var()) + if (auto ifExprVar = opInst.if_expr_var()) ifCond = moduleTranslation.lookupValue(ifExprVar); llvm::Value *numThreads = nullptr; - if (auto numThreadsVar = cast(opInst).num_threads_var()) + if (auto numThreadsVar = opInst.num_threads_var()) numThreads = moduleTranslation.lookupValue(numThreadsVar); - llvm::omp::ProcBindKind pbKind = llvm::omp::OMP_PROC_BIND_default; - if (auto bind = cast(opInst).proc_bind_val()) - pbKind = llvm::omp::getProcBindKind(bind.getValue()); + auto pbKind = llvm::omp::OMP_PROC_BIND_default; + if (auto bind = opInst.proc_bind_val()) + pbKind = getProcBindKind(*bind); // TODO: Is the Parallel construct cancellable? 
bool isCancellable = false; @@ -486,8 +499,7 @@ LLVM::ModuleTranslation &moduleTranslation) { auto orderedOp = cast(opInst); - omp::ClauseDepend dependType = - *omp::symbolizeClauseDepend(orderedOp.depend_type_valAttr().getValue()); + omp::ClauseDepend dependType = *orderedOp.depend_type_val(); bool isDependSource = dependType == omp::ClauseDepend::dependsource; unsigned numLoops = orderedOp.num_loops_val().getValue(); SmallVector vecValues = @@ -628,10 +640,8 @@ return failure(); // Static is the default. - omp::ClauseScheduleKind schedule = omp::ClauseScheduleKind::Static; - if (loop.schedule_val().hasValue()) - schedule = - *omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue()); + auto schedule = + loop.schedule_val().getValueOr(omp::ClauseScheduleKind::Static); // Find the loop configuration. llvm::Value *step = moduleTranslation.lookupValue(loop.step()[0]); @@ -788,10 +798,8 @@ break; } - if (loop.schedule_modifier().hasValue()) { - omp::ScheduleModifier modifier = - *omp::symbolizeScheduleModifier(loop.schedule_modifier().getValue()); - switch (modifier) { + if (Optional modifier = loop.schedule_modifier()) { + switch (*modifier) { case omp::ScheduleModifier::monotonic: schedType |= llvm::omp::OMPScheduleType::ModifierMonotonic; break; @@ -861,17 +869,23 @@ } // Convert an Atomic Ordering attribute to llvm::AtomicOrdering. -llvm::AtomicOrdering convertAtomicOrdering(Optional aoAttr) { - if (!aoAttr.hasValue()) +llvm::AtomicOrdering +convertAtomicOrdering(Optional ao) { + if (!ao) return llvm::AtomicOrdering::Monotonic; // Default Memory Ordering - return StringSwitch(aoAttr.getValue()) - .Case("seq_cst", llvm::AtomicOrdering::SequentiallyConsistent) - .Case("acq_rel", llvm::AtomicOrdering::AcquireRelease) - .Case("acquire", llvm::AtomicOrdering::Acquire) - .Case("release", llvm::AtomicOrdering::Release) - .Case("relaxed", llvm::AtomicOrdering::Monotonic) - .Default(llvm::AtomicOrdering::Monotonic); + switch (*ao) { + case omp::ClauseMemoryOrderKind::seq_cst: + return llvm::AtomicOrdering::SequentiallyConsistent; + case omp::ClauseMemoryOrderKind::acq_rel: + return llvm::AtomicOrdering::AcquireRelease; + case omp::ClauseMemoryOrderKind::acquire: + return llvm::AtomicOrdering::Acquire; + case omp::ClauseMemoryOrderKind::release: + return llvm::AtomicOrdering::Release; + case omp::ClauseMemoryOrderKind::relaxed: + return llvm::AtomicOrdering::Monotonic; + } } // Convert omp.atomic.read operation to LLVM IR. 
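// Note that convertAtomicOrdering above no longer goes through a StringSwitch:
// seq_cst, acq_rel, acquire, release and relaxed map directly onto the
// corresponding llvm::AtomicOrdering values, and an absent memory_order clause
// still defaults to Monotonic.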
@@ -1021,8 +1035,8 @@ ompBuilder->createFlush(builder.saveIP()); return success(); }) - .Case([&](omp::ParallelOp) { - return convertOmpParallel(*op, builder, moduleTranslation); + .Case([&](omp::ParallelOp op) { + return convertOmpParallel(op, builder, moduleTranslation); }) .Case([&](omp::ReductionOp reductionOp) { return convertOmpReductionOp(reductionOp, builder, moduleTranslation); diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -11,43 +11,43 @@ // CHECK: = nvvm.read.ptx.sreg.tid.x : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) + %tIdX = gpu.thread_id x // CHECK: = nvvm.read.ptx.sreg.tid.y : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index) + %tIdY = gpu.thread_id y // CHECK: = nvvm.read.ptx.sreg.tid.z : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index) + %tIdZ = gpu.thread_id z // CHECK: = nvvm.read.ptx.sreg.ntid.x : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) + %bDimX = gpu.block_dim x // CHECK: = nvvm.read.ptx.sreg.ntid.y : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index) + %bDimY = gpu.block_dim y // CHECK: = nvvm.read.ptx.sreg.ntid.z : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index) + %bDimZ = gpu.block_dim z // CHECK: = nvvm.read.ptx.sreg.ctaid.x : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index) + %bIdX = gpu.block_id x // CHECK: = nvvm.read.ptx.sreg.ctaid.y : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) + %bIdY = gpu.block_id y // CHECK: = nvvm.read.ptx.sreg.ctaid.z : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index) + %bIdZ = gpu.block_id z // CHECK: = nvvm.read.ptx.sreg.nctaid.x : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index) + %gDimX = gpu.grid_dim x // CHECK: = nvvm.read.ptx.sreg.nctaid.y : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index) + %gDimY = gpu.grid_dim y // CHECK: = nvvm.read.ptx.sreg.nctaid.z : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) + %gDimZ = gpu.grid_dim z std.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ, %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ @@ -78,10 +78,10 @@ gpu.func @gpu_all_reduce_op() { %arg0 = arith.constant 1.0 : f32 // TODO: Check full IR expansion once lowering has settled. - // CHECK: nvvm.shfl.sync "bfly" {{.*}} + // CHECK: nvvm.shfl.sync bfly {{.*}} // CHECK: nvvm.barrier0 // CHECK: llvm.fadd - %result = "gpu.all_reduce"(%arg0) ({}) {op = "add"} : (f32) -> (f32) + %result = gpu.all_reduce add %arg0 {} : (f32) -> (f32) gpu.return } @@ -94,13 +94,13 @@ gpu.func @gpu_all_reduce_region() { %arg0 = arith.constant 1 : i32 // TODO: Check full IR expansion once lowering has settled. 
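    // The region form below keeps its accumulator block instead of naming a
    // reduction kind inline, so only the assembly of `gpu.all_reduce` (and of
    // `nvvm.shfl.sync` in the CHECK lines) changes here, not the lowering.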
- // CHECK: nvvm.shfl.sync "bfly" {{.*}} + // CHECK: nvvm.shfl.sync bfly {{.*}} // CHECK: nvvm.barrier0 - %result = "gpu.all_reduce"(%arg0) ({ + %result = gpu.all_reduce %arg0 { ^bb(%lhs : i32, %rhs : i32): %xor = arith.xori %lhs, %rhs : i32 "gpu.yield"(%xor) : (i32) -> () - }) : (i32) -> (i32) + } : (i32) -> (i32) gpu.return } } @@ -120,16 +120,16 @@ // CHECK: %[[#SHL:]] = llvm.shl %[[#ONE]], %[[#WIDTH]] : i32 // CHECK: %[[#MASK:]] = llvm.sub %[[#SHL]], %[[#ONE]] : i32 // CHECK: %[[#CLAMP:]] = llvm.sub %[[#WIDTH]], %[[#ONE]] : i32 - // CHECK: %[[#SHFL:]] = nvvm.shfl.sync "bfly" %[[#MASK]], %[[#VALUE]], %[[#OFFSET]], %[[#CLAMP]] {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> + // CHECK: %[[#SHFL:]] = nvvm.shfl.sync bfly %[[#MASK]], %[[#VALUE]], %[[#OFFSET]], %[[#CLAMP]] {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> // CHECK: llvm.extractvalue %[[#SHFL]][0 : index] : !llvm.struct<(f32, i1)> // CHECK: llvm.extractvalue %[[#SHFL]][1 : index] : !llvm.struct<(f32, i1)> - %shfl, %pred = "gpu.shuffle"(%arg0, %arg1, %arg2) { mode = "xor" } : (f32, i32, i32) -> (f32, i1) - // CHECK: nvvm.shfl.sync "up" {{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> - %shflu, %predu = "gpu.shuffle"(%arg0, %arg1, %arg2) { mode = "up" } : (f32, i32, i32) -> (f32, i1) - // CHECK: nvvm.shfl.sync "down" {{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> - %shfld, %predd = "gpu.shuffle"(%arg0, %arg1, %arg2) { mode = "down" } : (f32, i32, i32) -> (f32, i1) - // CHECK: nvvm.shfl.sync "idx" {{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> - %shfli, %predi = "gpu.shuffle"(%arg0, %arg1, %arg2) { mode = "idx" } : (f32, i32, i32) -> (f32, i1) + %shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : f32 + // CHECK: nvvm.shfl.sync up {{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> + %shflu, %predu = gpu.shuffle up %arg0, %arg1, %arg2 : f32 + // CHECK: nvvm.shfl.sync down {{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> + %shfld, %predd = gpu.shuffle down %arg0, %arg1, %arg2 : f32 + // CHECK: nvvm.shfl.sync idx {{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> + %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 std.return %shfl, %shflu, %shfld, %shfli : f32, f32,f32, f32 } diff --git a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir --- a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir @@ -20,7 +20,7 @@ // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i64) -> !llvm.ptr // CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]] - // CHECK-SAME: {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + // CHECK-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: %[[INX:.*]] = 
llvm.mlir.constant(16 : index) : i32 @@ -32,7 +32,7 @@ // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i32) -> !llvm.ptr // CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK32: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]] - // CHECK32-SAME: {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + // CHECK32-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> return %0 : !gpu.mma_matrix<16x16xf16, "AOp"> } @@ -64,7 +64,7 @@ // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i64) -> !llvm.ptr // CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK: nvvm.wmma.store %[[ADDRESS]], %[[LDM32]], %[[EL1]], %[[EL2]], %[[EL3]], %[[EL4]] - // CHECK-SAME: {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> + // CHECK-SAME: {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> // CHECK: llvm.return // CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32 @@ -80,7 +80,7 @@ // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i32) -> !llvm.ptr // CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK32: nvvm.wmma.store %[[ADDRESS]], %[[LDM32]], %[[EL1]], %[[EL2]], %[[EL3]], %[[EL4]] - // CHECK32-SAME: {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> + // CHECK32-SAME: {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> // CHECK32: llvm.return return } @@ -115,7 +115,7 @@ // CHECK: %[[C3:.*]] = llvm.extractvalue %[[C]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[C4:.*]] = llvm.extractvalue %[[C]][3 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[RES:.*]] = nvvm.wmma.mma %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]], %[[A6]], %[[A7]], %[[A8]], %[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B5]], %[[B6]], %[[B7]], %[[B8]], %[[C1]], %[[C2]], %[[C3]], %[[C4]] - // CHECK-SAME: {eltypeA = "f16", eltypeB = "f16", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} : ( + // CHECK-SAME: {eltypeA = #nvvm.mma_type, eltypeB = #nvvm.mma_type, k = 16 : i32, layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : ( // CHECK-SAME: vector<2xf16>, {{.*}}) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: llvm.return %[[RES]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> return %D : !gpu.mma_matrix<16x16xf16, "COp"> @@ -127,13 +127,13 @@ gpu.module @test_module { // 
CHECK-LABEL: func @gpu_wmma_mma_loop_op -// CHECK: %[[C:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = "f16", frag = "c", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> +// CHECK: %[[C:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: llvm.br ^bb1(%{{.*}}, %[[C]] : i64, !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>) // CHECK: ^bb1(%{{.*}}: i64, %[[ACC:.+]]: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>): // 2 preds: ^bb0, ^bb2 // CHECK: llvm.cond_br %{{.*}}, ^bb2, ^bb3 // CHECK: ^bb2: // pred: ^bb1 -// CHECK: %[[A:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> -// CHECK: %[[B:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = "f16", frag = "b", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> +// CHECK: %[[A:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> +// CHECK: %[[B:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[A0:.+]] = llvm.extractvalue %[[A]][0 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[A1:.+]] = llvm.extractvalue %[[A]][1 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[A2:.+]] = llvm.extractvalue %[[A]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> @@ -154,14 +154,14 @@ // CHECK: %[[ACC1:.+]] = llvm.extractvalue %[[ACC]][1 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[ACC2:.+]] = llvm.extractvalue %[[ACC]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[ACC3:.+]] = llvm.extractvalue %[[ACC]][3 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> -// CHECK: %[[ACC_MUL:.+]] = nvvm.wmma.mma %[[A0]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]], %[[A6]], %[[A7]], %[[B0]], %[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B5]], %[[B6]], %[[B7]], %[[ACC0]], %[[ACC1]], %[[ACC2]], %[[ACC3]] {eltypeA = "f16", eltypeB = "f16", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} : (vector<2xf16>, {{.*}} -> !llvm.struct<(vector<2xf16>, vector<2xf16>, 
vector<2xf16>, vector<2xf16>)> +// CHECK: %[[ACC_MUL:.+]] = nvvm.wmma.mma %[[A0]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]], %[[A6]], %[[A7]], %[[B0]], %[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B5]], %[[B6]], %[[B7]], %[[ACC0]], %[[ACC1]], %[[ACC2]], %[[ACC3]] {eltypeA = #nvvm.mma_type, eltypeB = #nvvm.mma_type, k = 16 : i32, layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (vector<2xf16>, {{.*}} -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: llvm.br ^bb1(%{{.*}}, %[[ACC_MUL]] : i64, !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>) // CHECK: ^bb3: // pred: ^bb1 // CHECK: %[[E0:.+]] = llvm.extractvalue %[[ACC]][0 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[E1:.+]] = llvm.extractvalue %[[ACC]][1 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[E2:.+]] = llvm.extractvalue %[[ACC]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[E3:.+]] = llvm.extractvalue %[[ACC]][3 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> -// CHECK: nvvm.wmma.store %{{.*}}, %{{.*}}, %[[E0]], %[[E1]], %[[E2]], %[[E3]] {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> +// CHECK: nvvm.wmma.store %{{.*}}, %{{.*}}, %[[E0]], %[[E1]], %[[E2]], %[[E3]] {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> builtin.func @gpu_wmma_mma_loop_op(%arg0: memref<128x128xf16>, %arg1: memref<128x128xf16>, %arg2: memref<128x128xf16>) { %c0 = arith.constant 0 : index @@ -233,8 +233,7 @@ // CHECK: %[[M4:.*]] = llvm.insertvalue %[[C3]], %[[M3]][3 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: llvm.return %[[M4]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> builtin.func @gpu_wmma_elementwise(%A : !gpu.mma_matrix<16x16xf16, "COp">, %B : !gpu.mma_matrix<16x16xf16, "COp">) ->(!gpu.mma_matrix<16x16xf16, "COp">) { - %C = gpu.subgroup_mma_elementwise %A, %B { operation = "ADDF" } : - (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> + %C = gpu.subgroup_mma_elementwise addf %A, %B : (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> return %C : !gpu.mma_matrix<16x16xf16, "COp"> } } diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -11,43 +11,43 @@ // CHECK: rocdl.workitem.id.x : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) + %tIdX = gpu.thread_id x // CHECK: rocdl.workitem.id.y : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index) + %tIdY = gpu.thread_id y // CHECK: rocdl.workitem.id.z : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index) + %tIdZ = gpu.thread_id z // CHECK: rocdl.workgroup.dim.x : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) + %bDimX 
= gpu.block_dim x // CHECK: rocdl.workgroup.dim.y : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index) + %bDimY = gpu.block_dim y // CHECK: rocdl.workgroup.dim.z : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index) + %bDimZ = gpu.block_dim z // CHECK: rocdl.workgroup.id.x : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index) + %bIdX = gpu.block_id x // CHECK: rocdl.workgroup.id.y : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) + %bIdY = gpu.block_id y // CHECK: rocdl.workgroup.id.z : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index) + %bIdZ = gpu.block_id z // CHECK: rocdl.grid.dim.x : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index) + %gDimX = gpu.grid_dim x // CHECK: rocdl.grid.dim.y : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index) + %gDimY = gpu.grid_dim y // CHECK: rocdl.grid.dim.z : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 - %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) + %gDimZ = gpu.grid_dim z std.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ, %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ diff --git a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir --- a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir @@ -16,7 +16,7 @@ // CHECK: [[ADDRESS:%.*]] = spv.mlir.addressof [[WORKGROUPID]] // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]] // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}} - %0 = "gpu.block_id"() {dimension = "x"} : () -> index + %0 = gpu.block_id x gpu.return } } @@ -42,7 +42,7 @@ // CHECK: [[ADDRESS:%.*]] = spv.mlir.addressof [[WORKGROUPID]] // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]] // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}1 : i32{{\]}} - %0 = "gpu.block_id"() {dimension = "y"} : () -> index + %0 = gpu.block_id y gpu.return } } @@ -66,7 +66,7 @@ // CHECK: [[ADDRESS:%.*]] = spv.mlir.addressof [[WORKGROUPID]] // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]] // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}2 : i32{{\]}} - %0 = "gpu.block_id"() {dimension = "z"} : () -> index + %0 = gpu.block_id z gpu.return } } @@ -91,7 +91,7 @@ // We may want to define gpu.workgroup_size and convert it to the entry // point ABI we want here. // CHECK: spv.Constant 32 : i32 - %0 = "gpu.block_dim"() {dimension = "x"} : () -> index + %0 = gpu.block_dim x gpu.return } } @@ -113,7 +113,7 @@ attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]>: vector<3xi32>}} { // The constant value is obtained from the spv.entry_point_abi. // CHECK: spv.Constant 4 : i32 - %0 = "gpu.block_dim"() {dimension = "y"} : () -> index + %0 = gpu.block_dim y gpu.return } } @@ -135,7 +135,7 @@ attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]>: vector<3xi32>}} { // The constant value is obtained from the spv.entry_point_abi. 
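    // With the Dimension enum, gpu.block_dim z simply indexes element 2 of
    // the local_size vector, which is the constant 1 checked below.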
// CHECK: spv.Constant 1 : i32 - %0 = "gpu.block_dim"() {dimension = "z"} : () -> index + %0 = gpu.block_dim z gpu.return } } @@ -159,7 +159,7 @@ // CHECK: [[ADDRESS:%.*]] = spv.mlir.addressof [[LOCALINVOCATIONID]] // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]] // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}} - %0 = "gpu.thread_id"() {dimension = "x"} : () -> index + %0 = gpu.thread_id x gpu.return } } @@ -183,7 +183,7 @@ // CHECK: [[ADDRESS:%.*]] = spv.mlir.addressof [[NUMWORKGROUPS]] // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]] // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}} - %0 = "gpu.grid_dim"() {dimension = "x"} : () -> index + %0 = gpu.grid_dim x gpu.return } } diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir --- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir @@ -43,18 +43,18 @@ // CHECK: %[[ADDRESSLOCALINVOCATIONID:.*]] = spv.mlir.addressof @[[$LOCALINVOCATIONIDVAR]] // CHECK: %[[LOCALINVOCATIONID:.*]] = spv.Load "Input" %[[ADDRESSLOCALINVOCATIONID]] // CHECK: %[[LOCALINVOCATIONIDX:.*]] = spv.CompositeExtract %[[LOCALINVOCATIONID]]{{\[}}0 : i32{{\]}} - %0 = "gpu.block_id"() {dimension = "x"} : () -> index - %1 = "gpu.block_id"() {dimension = "y"} : () -> index - %2 = "gpu.block_id"() {dimension = "z"} : () -> index - %3 = "gpu.thread_id"() {dimension = "x"} : () -> index - %4 = "gpu.thread_id"() {dimension = "y"} : () -> index - %5 = "gpu.thread_id"() {dimension = "z"} : () -> index - %6 = "gpu.grid_dim"() {dimension = "x"} : () -> index - %7 = "gpu.grid_dim"() {dimension = "y"} : () -> index - %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index - %9 = "gpu.block_dim"() {dimension = "x"} : () -> index - %10 = "gpu.block_dim"() {dimension = "y"} : () -> index - %11 = "gpu.block_dim"() {dimension = "z"} : () -> index + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z // CHECK: %[[INDEX1:.*]] = spv.IAdd %[[ARG3]], %[[WORKGROUPIDX]] %12 = arith.addi %arg3, %0 : index // CHECK: %[[INDEX2:.*]] = spv.IAdd %[[ARG4]], %[[LOCALINVOCATIONIDX]] diff --git a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir --- a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir +++ b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir @@ -92,7 +92,7 @@ // CHECK-DAG: %[[C0:.+]] = gpu.subgroup_mma_constant_matrix %[[CST_0]] : !gpu.mma_matrix<16x16xf16, "COp"> // CHECK-DAG: %[[C1:.+]] = gpu.subgroup_mma_constant_matrix %[[CST_1]] : !gpu.mma_matrix<16x16xf16, "COp"> // CHECK: %[[D:.+]] = gpu.subgroup_mma_compute %[[A]], %[[B]], %[[C0]] : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf16, "COp"> -// CHECK: %[[E:.+]] = gpu.subgroup_mma_elementwise %[[D]], %[[C1]] {operation = "ADDF"} : (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> +// CHECK: %[[E:.+]] = gpu.subgroup_mma_elementwise addf %[[D]], %[[C1]] : (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> // CHECK: gpu.subgroup_mma_store_matrix %[[E]], %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : 
!gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16> func @matmul_fused_elementwise(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<16x16xf16>) { %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf16> @@ -114,7 +114,7 @@ // CHECK-DAG: %[[C0:.+]] = gpu.subgroup_mma_constant_matrix %[[CST_0]] : !gpu.mma_matrix<16x16xf16, "COp"> // CHECK: %[[D:.+]] = gpu.subgroup_mma_compute %[[A]], %[[B]], %[[C0]] : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf16, "COp"> // CHECK: %[[E:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] {leadDimension = 0 : index} : memref<16x16x16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp"> -// CHECK: %[[F:.+]] = gpu.subgroup_mma_elementwise %[[D]], %[[E]] {operation = "DIVF"} : (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> +// CHECK: %[[F:.+]] = gpu.subgroup_mma_elementwise divf %[[D]], %[[E]] : (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> // CHECK: gpu.subgroup_mma_store_matrix %[[F]], %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16> func @matmul_fused_broadcast(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<16x16xf16>, %arg3: memref<16x16x16x16xf16>) { diff --git a/mlir/test/Dialect/GPU/all-reduce-max.mlir b/mlir/test/Dialect/GPU/all-reduce-max.mlir --- a/mlir/test/Dialect/GPU/all-reduce-max.mlir +++ b/mlir/test/Dialect/GPU/all-reduce-max.mlir @@ -16,17 +16,17 @@ // CHECK-DAG: [[VAL_8:%.*]] = arith.constant 4 : i32 // CHECK-DAG: [[VAL_9:%.*]] = arith.constant 8 : i32 // CHECK-DAG: [[VAL_10:%.*]] = arith.constant 16 : i32 - // CHECK: [[VAL_11:%.*]] = "gpu.block_dim"() {dimension = "x"} : () -> index + // CHECK: [[VAL_11:%.*]] = gpu.block_dim x // CHECK: [[VAL_12:%.*]] = arith.index_cast [[VAL_11]] : index to i32 - // CHECK: [[VAL_13:%.*]] = "gpu.block_dim"() {dimension = "y"} : () -> index + // CHECK: [[VAL_13:%.*]] = gpu.block_dim y // CHECK: [[VAL_14:%.*]] = arith.index_cast [[VAL_13]] : index to i32 - // CHECK: [[VAL_15:%.*]] = "gpu.block_dim"() {dimension = "z"} : () -> index + // CHECK: [[VAL_15:%.*]] = gpu.block_dim z // CHECK: [[VAL_16:%.*]] = arith.index_cast [[VAL_15]] : index to i32 - // CHECK: [[VAL_17:%.*]] = "gpu.thread_id"() {dimension = "x"} : () -> index + // CHECK: [[VAL_17:%.*]] = gpu.thread_id x // CHECK: [[VAL_18:%.*]] = arith.index_cast [[VAL_17]] : index to i32 - // CHECK: [[VAL_19:%.*]] = "gpu.thread_id"() {dimension = "y"} : () -> index + // CHECK: [[VAL_19:%.*]] = gpu.thread_id y // CHECK: [[VAL_20:%.*]] = arith.index_cast [[VAL_19]] : index to i32 - // CHECK: [[VAL_21:%.*]] = "gpu.thread_id"() {dimension = "z"} : () -> index + // CHECK: [[VAL_21:%.*]] = gpu.thread_id z // CHECK: [[VAL_22:%.*]] = arith.index_cast [[VAL_21]] : index to i32 // CHECK: [[VAL_23:%.*]] = arith.muli [[VAL_22]], [[VAL_14]] : i32 // CHECK: [[VAL_24:%.*]] = arith.addi [[VAL_23]], [[VAL_20]] : i32 @@ -41,7 +41,7 @@ // CHECK: [[VAL_33:%.*]] = arith.cmpi slt, [[VAL_32]], [[VAL_5]] : i32 // CHECK: cond_br [[VAL_33]], ^bb1, ^bb17 // CHECK: ^bb1: - // CHECK: [[VAL_34:%.*]], [[VAL_35:%.*]] = gpu.shuffle [[VAL_0]], [[VAL_6]], [[VAL_32]] xor : f32 + // CHECK: [[VAL_34:%.*]], [[VAL_35:%.*]] = gpu.shuffle xor [[VAL_0]], [[VAL_6]], [[VAL_32]] : f32 // CHECK: cond_br [[VAL_35]], ^bb2, ^bb3 // CHECK: ^bb2: // CHECK: [[VAL_36:%.*]] = arith.cmpf ugt, [[VAL_0]], [[VAL_34]] : f32 @@ 
-50,7 +50,7 @@ // CHECK: ^bb3: // CHECK: br ^bb4([[VAL_0]] : f32) // CHECK: ^bb4([[VAL_38:%.*]]: f32): - // CHECK: [[VAL_39:%.*]], [[VAL_40:%.*]] = gpu.shuffle [[VAL_38]], [[VAL_7]], [[VAL_32]] xor : f32 + // CHECK: [[VAL_39:%.*]], [[VAL_40:%.*]] = gpu.shuffle xor [[VAL_38]], [[VAL_7]], [[VAL_32]] : f32 // CHECK: cond_br [[VAL_40]], ^bb5, ^bb6 // CHECK: ^bb5: // CHECK: [[VAL_41:%.*]] = arith.cmpf ugt, [[VAL_38]], [[VAL_39]] : f32 @@ -59,7 +59,7 @@ // CHECK: ^bb6: // CHECK: br ^bb7([[VAL_38]] : f32) // CHECK: ^bb7([[VAL_43:%.*]]: f32): - // CHECK: [[VAL_44:%.*]], [[VAL_45:%.*]] = gpu.shuffle [[VAL_43]], [[VAL_8]], [[VAL_32]] xor : f32 + // CHECK: [[VAL_44:%.*]], [[VAL_45:%.*]] = gpu.shuffle xor [[VAL_43]], [[VAL_8]], [[VAL_32]] : f32 // CHECK: cond_br [[VAL_45]], ^bb8, ^bb9 // CHECK: ^bb8: // CHECK: [[VAL_46:%.*]] = arith.cmpf ugt, [[VAL_43]], [[VAL_44]] : f32 @@ -68,7 +68,7 @@ // CHECK: ^bb9: // CHECK: br ^bb10([[VAL_43]] : f32) // CHECK: ^bb10([[VAL_48:%.*]]: f32): - // CHECK: [[VAL_49:%.*]], [[VAL_50:%.*]] = gpu.shuffle [[VAL_48]], [[VAL_9]], [[VAL_32]] xor : f32 + // CHECK: [[VAL_49:%.*]], [[VAL_50:%.*]] = gpu.shuffle xor [[VAL_48]], [[VAL_9]], [[VAL_32]] : f32 // CHECK: cond_br [[VAL_50]], ^bb11, ^bb12 // CHECK: ^bb11: // CHECK: [[VAL_51:%.*]] = arith.cmpf ugt, [[VAL_48]], [[VAL_49]] : f32 @@ -77,7 +77,7 @@ // CHECK: ^bb12: // CHECK: br ^bb13([[VAL_48]] : f32) // CHECK: ^bb13([[VAL_53:%.*]]: f32): - // CHECK: [[VAL_54:%.*]], [[VAL_55:%.*]] = gpu.shuffle [[VAL_53]], [[VAL_10]], [[VAL_32]] xor : f32 + // CHECK: [[VAL_54:%.*]], [[VAL_55:%.*]] = gpu.shuffle xor [[VAL_53]], [[VAL_10]], [[VAL_32]] : f32 // CHECK: cond_br [[VAL_55]], ^bb14, ^bb15 // CHECK: ^bb14: // CHECK: [[VAL_56:%.*]] = arith.cmpf ugt, [[VAL_53]], [[VAL_54]] : f32 @@ -88,19 +88,19 @@ // CHECK: ^bb16([[VAL_58:%.*]]: f32): // CHECK: br ^bb18([[VAL_58]] : f32) // CHECK: ^bb17: - // CHECK: [[VAL_59:%.*]], [[VAL_60:%.*]] = gpu.shuffle [[VAL_0]], [[VAL_6]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_59:%.*]], [[VAL_60:%.*]] = gpu.shuffle xor [[VAL_0]], [[VAL_6]], [[VAL_5]] : f32 // CHECK: [[VAL_61:%.*]] = arith.cmpf ugt, [[VAL_0]], [[VAL_59]] : f32 // CHECK: [[VAL_62:%.*]] = select [[VAL_61]], [[VAL_0]], [[VAL_59]] : f32 - // CHECK: [[VAL_63:%.*]], [[VAL_64:%.*]] = gpu.shuffle [[VAL_62]], [[VAL_7]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_63:%.*]], [[VAL_64:%.*]] = gpu.shuffle xor [[VAL_62]], [[VAL_7]], [[VAL_5]] : f32 // CHECK: [[VAL_65:%.*]] = arith.cmpf ugt, [[VAL_62]], [[VAL_63]] : f32 // CHECK: [[VAL_66:%.*]] = select [[VAL_65]], [[VAL_62]], [[VAL_63]] : f32 - // CHECK: [[VAL_67:%.*]], [[VAL_68:%.*]] = gpu.shuffle [[VAL_66]], [[VAL_8]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_67:%.*]], [[VAL_68:%.*]] = gpu.shuffle xor [[VAL_66]], [[VAL_8]], [[VAL_5]] : f32 // CHECK: [[VAL_69:%.*]] = arith.cmpf ugt, [[VAL_66]], [[VAL_67]] : f32 // CHECK: [[VAL_70:%.*]] = select [[VAL_69]], [[VAL_66]], [[VAL_67]] : f32 - // CHECK: [[VAL_71:%.*]], [[VAL_72:%.*]] = gpu.shuffle [[VAL_70]], [[VAL_9]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_71:%.*]], [[VAL_72:%.*]] = gpu.shuffle xor [[VAL_70]], [[VAL_9]], [[VAL_5]] : f32 // CHECK: [[VAL_73:%.*]] = arith.cmpf ugt, [[VAL_70]], [[VAL_71]] : f32 // CHECK: [[VAL_74:%.*]] = select [[VAL_73]], [[VAL_70]], [[VAL_71]] : f32 - // CHECK: [[VAL_75:%.*]], [[VAL_76:%.*]] = gpu.shuffle [[VAL_74]], [[VAL_10]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_75:%.*]], [[VAL_76:%.*]] = gpu.shuffle xor [[VAL_74]], [[VAL_10]], [[VAL_5]] : f32 // CHECK: [[VAL_77:%.*]] = arith.cmpf ugt, [[VAL_74]], [[VAL_75]] : f32 // CHECK: 
[[VAL_78:%.*]] = select [[VAL_77]], [[VAL_74]], [[VAL_75]] : f32 // CHECK: br ^bb18([[VAL_78]] : f32) @@ -125,7 +125,7 @@ // CHECK: [[VAL_87:%.*]] = arith.cmpi slt, [[VAL_83]], [[VAL_5]] : i32 // CHECK: cond_br [[VAL_87]], ^bb23, ^bb39 // CHECK: ^bb23: - // CHECK: [[VAL_88:%.*]], [[VAL_89:%.*]] = gpu.shuffle [[VAL_86]], [[VAL_6]], [[VAL_83]] xor : f32 + // CHECK: [[VAL_88:%.*]], [[VAL_89:%.*]] = gpu.shuffle xor [[VAL_86]], [[VAL_6]], [[VAL_83]] : f32 // CHECK: cond_br [[VAL_89]], ^bb24, ^bb25 // CHECK: ^bb24: // CHECK: [[VAL_90:%.*]] = arith.cmpf ugt, [[VAL_86]], [[VAL_88]] : f32 @@ -134,7 +134,7 @@ // CHECK: ^bb25: // CHECK: br ^bb26([[VAL_86]] : f32) // CHECK: ^bb26([[VAL_92:%.*]]: f32): - // CHECK: [[VAL_93:%.*]], [[VAL_94:%.*]] = gpu.shuffle [[VAL_92]], [[VAL_7]], [[VAL_83]] xor : f32 + // CHECK: [[VAL_93:%.*]], [[VAL_94:%.*]] = gpu.shuffle xor [[VAL_92]], [[VAL_7]], [[VAL_83]] : f32 // CHECK: cond_br [[VAL_94]], ^bb27, ^bb28 // CHECK: ^bb27: // CHECK: [[VAL_95:%.*]] = arith.cmpf ugt, [[VAL_92]], [[VAL_93]] : f32 @@ -143,7 +143,7 @@ // CHECK: ^bb28: // CHECK: br ^bb29([[VAL_92]] : f32) // CHECK: ^bb29([[VAL_97:%.*]]: f32): - // CHECK: [[VAL_98:%.*]], [[VAL_99:%.*]] = gpu.shuffle [[VAL_97]], [[VAL_8]], [[VAL_83]] xor : f32 + // CHECK: [[VAL_98:%.*]], [[VAL_99:%.*]] = gpu.shuffle xor [[VAL_97]], [[VAL_8]], [[VAL_83]] : f32 // CHECK: cond_br [[VAL_99]], ^bb30, ^bb31 // CHECK: ^bb30: // CHECK: [[VAL_100:%.*]] = arith.cmpf ugt, [[VAL_97]], [[VAL_98]] : f32 @@ -152,7 +152,7 @@ // CHECK: ^bb31: // CHECK: br ^bb32([[VAL_97]] : f32) // CHECK: ^bb32([[VAL_102:%.*]]: f32): - // CHECK: [[VAL_103:%.*]], [[VAL_104:%.*]] = gpu.shuffle [[VAL_102]], [[VAL_9]], [[VAL_83]] xor : f32 + // CHECK: [[VAL_103:%.*]], [[VAL_104:%.*]] = gpu.shuffle xor [[VAL_102]], [[VAL_9]], [[VAL_83]] : f32 // CHECK: cond_br [[VAL_104]], ^bb33, ^bb34 // CHECK: ^bb33: // CHECK: [[VAL_105:%.*]] = arith.cmpf ugt, [[VAL_102]], [[VAL_103]] : f32 @@ -161,7 +161,7 @@ // CHECK: ^bb34: // CHECK: br ^bb35([[VAL_102]] : f32) // CHECK: ^bb35([[VAL_107:%.*]]: f32): - // CHECK: [[VAL_108:%.*]], [[VAL_109:%.*]] = gpu.shuffle [[VAL_107]], [[VAL_10]], [[VAL_83]] xor : f32 + // CHECK: [[VAL_108:%.*]], [[VAL_109:%.*]] = gpu.shuffle xor [[VAL_107]], [[VAL_10]], [[VAL_83]] : f32 // CHECK: cond_br [[VAL_109]], ^bb36, ^bb37 // CHECK: ^bb36: // CHECK: [[VAL_110:%.*]] = arith.cmpf ugt, [[VAL_107]], [[VAL_108]] : f32 @@ -172,19 +172,19 @@ // CHECK: ^bb38([[VAL_112:%.*]]: f32): // CHECK: br ^bb40([[VAL_112]] : f32) // CHECK: ^bb39: - // CHECK: [[VAL_113:%.*]], [[VAL_114:%.*]] = gpu.shuffle [[VAL_86]], [[VAL_6]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_113:%.*]], [[VAL_114:%.*]] = gpu.shuffle xor [[VAL_86]], [[VAL_6]], [[VAL_5]] : f32 // CHECK: [[VAL_115:%.*]] = arith.cmpf ugt, [[VAL_86]], [[VAL_113]] : f32 // CHECK: [[VAL_116:%.*]] = select [[VAL_115]], [[VAL_86]], [[VAL_113]] : f32 - // CHECK: [[VAL_117:%.*]], [[VAL_118:%.*]] = gpu.shuffle [[VAL_116]], [[VAL_7]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_117:%.*]], [[VAL_118:%.*]] = gpu.shuffle xor [[VAL_116]], [[VAL_7]], [[VAL_5]] : f32 // CHECK: [[VAL_119:%.*]] = arith.cmpf ugt, [[VAL_116]], [[VAL_117]] : f32 // CHECK: [[VAL_120:%.*]] = select [[VAL_119]], [[VAL_116]], [[VAL_117]] : f32 - // CHECK: [[VAL_121:%.*]], [[VAL_122:%.*]] = gpu.shuffle [[VAL_120]], [[VAL_8]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_121:%.*]], [[VAL_122:%.*]] = gpu.shuffle xor [[VAL_120]], [[VAL_8]], [[VAL_5]] : f32 // CHECK: [[VAL_123:%.*]] = arith.cmpf ugt, [[VAL_120]], [[VAL_121]] : f32 // CHECK: [[VAL_124:%.*]] = select 
[[VAL_123]], [[VAL_120]], [[VAL_121]] : f32 - // CHECK: [[VAL_125:%.*]], [[VAL_126:%.*]] = gpu.shuffle [[VAL_124]], [[VAL_9]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_125:%.*]], [[VAL_126:%.*]] = gpu.shuffle xor [[VAL_124]], [[VAL_9]], [[VAL_5]] : f32 // CHECK: [[VAL_127:%.*]] = arith.cmpf ugt, [[VAL_124]], [[VAL_125]] : f32 // CHECK: [[VAL_128:%.*]] = select [[VAL_127]], [[VAL_124]], [[VAL_125]] : f32 - // CHECK: [[VAL_129:%.*]], [[VAL_130:%.*]] = gpu.shuffle [[VAL_128]], [[VAL_10]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_129:%.*]], [[VAL_130:%.*]] = gpu.shuffle xor [[VAL_128]], [[VAL_10]], [[VAL_5]] : f32 // CHECK: [[VAL_131:%.*]] = arith.cmpf ugt, [[VAL_128]], [[VAL_129]] : f32 // CHECK: [[VAL_132:%.*]] = select [[VAL_131]], [[VAL_128]], [[VAL_129]] : f32 // CHECK: br ^bb40([[VAL_132]] : f32) @@ -195,7 +195,7 @@ // CHECK: br ^bb42 // CHECK: ^bb42: // CHECK: gpu.barrier - %sum = "gpu.all_reduce"(%arg0) ({}) {op = "max"} : (f32) -> (f32) + %sum = gpu.all_reduce max %arg0 {} : (f32) -> (f32) gpu.return } diff --git a/mlir/test/Dialect/GPU/all-reduce.mlir b/mlir/test/Dialect/GPU/all-reduce.mlir --- a/mlir/test/Dialect/GPU/all-reduce.mlir +++ b/mlir/test/Dialect/GPU/all-reduce.mlir @@ -16,17 +16,17 @@ // CHECK-DAG: [[VAL_8:%.*]] = arith.constant 4 : i32 // CHECK-DAG: [[VAL_9:%.*]] = arith.constant 8 : i32 // CHECK-DAG: [[VAL_10:%.*]] = arith.constant 16 : i32 - // CHECK: [[VAL_11:%.*]] = "gpu.block_dim"() {dimension = "x"} : () -> index + // CHECK: [[VAL_11:%.*]] = gpu.block_dim x // CHECK: [[VAL_12:%.*]] = arith.index_cast [[VAL_11]] : index to i32 - // CHECK: [[VAL_13:%.*]] = "gpu.block_dim"() {dimension = "y"} : () -> index + // CHECK: [[VAL_13:%.*]] = gpu.block_dim y // CHECK: [[VAL_14:%.*]] = arith.index_cast [[VAL_13]] : index to i32 - // CHECK: [[VAL_15:%.*]] = "gpu.block_dim"() {dimension = "z"} : () -> index + // CHECK: [[VAL_15:%.*]] = gpu.block_dim z // CHECK: [[VAL_16:%.*]] = arith.index_cast [[VAL_15]] : index to i32 - // CHECK: [[VAL_17:%.*]] = "gpu.thread_id"() {dimension = "x"} : () -> index + // CHECK: [[VAL_17:%.*]] = gpu.thread_id x // CHECK: [[VAL_18:%.*]] = arith.index_cast [[VAL_17]] : index to i32 - // CHECK: [[VAL_19:%.*]] = "gpu.thread_id"() {dimension = "y"} : () -> index + // CHECK: [[VAL_19:%.*]] = gpu.thread_id y // CHECK: [[VAL_20:%.*]] = arith.index_cast [[VAL_19]] : index to i32 - // CHECK: [[VAL_21:%.*]] = "gpu.thread_id"() {dimension = "z"} : () -> index + // CHECK: [[VAL_21:%.*]] = gpu.thread_id z // CHECK: [[VAL_22:%.*]] = arith.index_cast [[VAL_21]] : index to i32 // CHECK: [[VAL_23:%.*]] = arith.muli [[VAL_22]], [[VAL_14]] : i32 // CHECK: [[VAL_24:%.*]] = arith.addi [[VAL_23]], [[VAL_20]] : i32 @@ -41,7 +41,7 @@ // CHECK: [[VAL_33:%.*]] = arith.cmpi slt, [[VAL_32]], [[VAL_5]] : i32 // CHECK: cond_br [[VAL_33]], ^bb1, ^bb17 // CHECK: ^bb1: - // CHECK: [[VAL_34:%.*]], [[VAL_35:%.*]] = gpu.shuffle [[VAL_0]], [[VAL_6]], [[VAL_32]] xor : f32 + // CHECK: [[VAL_34:%.*]], [[VAL_35:%.*]] = gpu.shuffle xor [[VAL_0]], [[VAL_6]], [[VAL_32]] : f32 // CHECK: cond_br [[VAL_35]], ^bb2, ^bb3 // CHECK: ^bb2: // CHECK: [[VAL_36:%.*]] = arith.addf [[VAL_0]], [[VAL_34]] : f32 @@ -49,7 +49,7 @@ // CHECK: ^bb3: // CHECK: br ^bb4([[VAL_0]] : f32) // CHECK: ^bb4([[VAL_37:%.*]]: f32): - // CHECK: [[VAL_38:%.*]], [[VAL_39:%.*]] = gpu.shuffle [[VAL_37]], [[VAL_7]], [[VAL_32]] xor : f32 + // CHECK: [[VAL_38:%.*]], [[VAL_39:%.*]] = gpu.shuffle xor [[VAL_37]], [[VAL_7]], [[VAL_32]] : f32 // CHECK: cond_br [[VAL_39]], ^bb5, ^bb6 // CHECK: ^bb5: // CHECK: [[VAL_40:%.*]] = arith.addf 
[[VAL_37]], [[VAL_38]] : f32 @@ -57,7 +57,7 @@ // CHECK: ^bb6: // CHECK: br ^bb7([[VAL_37]] : f32) // CHECK: ^bb7([[VAL_41:%.*]]: f32): - // CHECK: [[VAL_42:%.*]], [[VAL_43:%.*]] = gpu.shuffle [[VAL_41]], [[VAL_8]], [[VAL_32]] xor : f32 + // CHECK: [[VAL_42:%.*]], [[VAL_43:%.*]] = gpu.shuffle xor [[VAL_41]], [[VAL_8]], [[VAL_32]] : f32 // CHECK: cond_br [[VAL_43]], ^bb8, ^bb9 // CHECK: ^bb8: // CHECK: [[VAL_44:%.*]] = arith.addf [[VAL_41]], [[VAL_42]] : f32 @@ -65,7 +65,7 @@ // CHECK: ^bb9: // CHECK: br ^bb10([[VAL_41]] : f32) // CHECK: ^bb10([[VAL_45:%.*]]: f32): - // CHECK: [[VAL_46:%.*]], [[VAL_47:%.*]] = gpu.shuffle [[VAL_45]], [[VAL_9]], [[VAL_32]] xor : f32 + // CHECK: [[VAL_46:%.*]], [[VAL_47:%.*]] = gpu.shuffle xor [[VAL_45]], [[VAL_9]], [[VAL_32]] : f32 // CHECK: cond_br [[VAL_47]], ^bb11, ^bb12 // CHECK: ^bb11: // CHECK: [[VAL_48:%.*]] = arith.addf [[VAL_45]], [[VAL_46]] : f32 @@ -73,7 +73,7 @@ // CHECK: ^bb12: // CHECK: br ^bb13([[VAL_45]] : f32) // CHECK: ^bb13([[VAL_49:%.*]]: f32): - // CHECK: [[VAL_50:%.*]], [[VAL_51:%.*]] = gpu.shuffle [[VAL_49]], [[VAL_10]], [[VAL_32]] xor : f32 + // CHECK: [[VAL_50:%.*]], [[VAL_51:%.*]] = gpu.shuffle xor [[VAL_49]], [[VAL_10]], [[VAL_32]] : f32 // CHECK: cond_br [[VAL_51]], ^bb14, ^bb15 // CHECK: ^bb14: // CHECK: [[VAL_52:%.*]] = arith.addf [[VAL_49]], [[VAL_50]] : f32 @@ -83,15 +83,15 @@ // CHECK: ^bb16([[VAL_53:%.*]]: f32): // CHECK: br ^bb18([[VAL_53]] : f32) // CHECK: ^bb17: - // CHECK: [[VAL_54:%.*]], [[VAL_55:%.*]] = gpu.shuffle [[VAL_0]], [[VAL_6]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_54:%.*]], [[VAL_55:%.*]] = gpu.shuffle xor [[VAL_0]], [[VAL_6]], [[VAL_5]] : f32 // CHECK: [[VAL_56:%.*]] = arith.addf [[VAL_0]], [[VAL_54]] : f32 - // CHECK: [[VAL_57:%.*]], [[VAL_58:%.*]] = gpu.shuffle [[VAL_56]], [[VAL_7]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_57:%.*]], [[VAL_58:%.*]] = gpu.shuffle xor [[VAL_56]], [[VAL_7]], [[VAL_5]] : f32 // CHECK: [[VAL_59:%.*]] = arith.addf [[VAL_56]], [[VAL_57]] : f32 - // CHECK: [[VAL_60:%.*]], [[VAL_61:%.*]] = gpu.shuffle [[VAL_59]], [[VAL_8]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_60:%.*]], [[VAL_61:%.*]] = gpu.shuffle xor [[VAL_59]], [[VAL_8]], [[VAL_5]] : f32 // CHECK: [[VAL_62:%.*]] = arith.addf [[VAL_59]], [[VAL_60]] : f32 - // CHECK: [[VAL_63:%.*]], [[VAL_64:%.*]] = gpu.shuffle [[VAL_62]], [[VAL_9]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_63:%.*]], [[VAL_64:%.*]] = gpu.shuffle xor [[VAL_62]], [[VAL_9]], [[VAL_5]] : f32 // CHECK: [[VAL_65:%.*]] = arith.addf [[VAL_62]], [[VAL_63]] : f32 - // CHECK: [[VAL_66:%.*]], [[VAL_67:%.*]] = gpu.shuffle [[VAL_65]], [[VAL_10]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_66:%.*]], [[VAL_67:%.*]] = gpu.shuffle xor [[VAL_65]], [[VAL_10]], [[VAL_5]] : f32 // CHECK: [[VAL_68:%.*]] = arith.addf [[VAL_65]], [[VAL_66]] : f32 // CHECK: br ^bb18([[VAL_68]] : f32) // CHECK: ^bb18([[VAL_69:%.*]]: f32): @@ -115,7 +115,7 @@ // CHECK: [[VAL_77:%.*]] = arith.cmpi slt, [[VAL_73]], [[VAL_5]] : i32 // CHECK: cond_br [[VAL_77]], ^bb23, ^bb39 // CHECK: ^bb23: - // CHECK: [[VAL_78:%.*]], [[VAL_79:%.*]] = gpu.shuffle [[VAL_76]], [[VAL_6]], [[VAL_73]] xor : f32 + // CHECK: [[VAL_78:%.*]], [[VAL_79:%.*]] = gpu.shuffle xor [[VAL_76]], [[VAL_6]], [[VAL_73]] : f32 // CHECK: cond_br [[VAL_79]], ^bb24, ^bb25 // CHECK: ^bb24: // CHECK: [[VAL_80:%.*]] = arith.addf [[VAL_76]], [[VAL_78]] : f32 @@ -123,7 +123,7 @@ // CHECK: ^bb25: // CHECK: br ^bb26([[VAL_76]] : f32) // CHECK: ^bb26([[VAL_81:%.*]]: f32): - // CHECK: [[VAL_82:%.*]], [[VAL_83:%.*]] = gpu.shuffle [[VAL_81]], [[VAL_7]], [[VAL_73]] xor : f32 
+ // CHECK: [[VAL_82:%.*]], [[VAL_83:%.*]] = gpu.shuffle xor [[VAL_81]], [[VAL_7]], [[VAL_73]] : f32 // CHECK: cond_br [[VAL_83]], ^bb27, ^bb28 // CHECK: ^bb27: // CHECK: [[VAL_84:%.*]] = arith.addf [[VAL_81]], [[VAL_82]] : f32 @@ -131,7 +131,7 @@ // CHECK: ^bb28: // CHECK: br ^bb29([[VAL_81]] : f32) // CHECK: ^bb29([[VAL_85:%.*]]: f32): - // CHECK: [[VAL_86:%.*]], [[VAL_87:%.*]] = gpu.shuffle [[VAL_85]], [[VAL_8]], [[VAL_73]] xor : f32 + // CHECK: [[VAL_86:%.*]], [[VAL_87:%.*]] = gpu.shuffle xor [[VAL_85]], [[VAL_8]], [[VAL_73]] : f32 // CHECK: cond_br [[VAL_87]], ^bb30, ^bb31 // CHECK: ^bb30: // CHECK: [[VAL_88:%.*]] = arith.addf [[VAL_85]], [[VAL_86]] : f32 @@ -139,7 +139,7 @@ // CHECK: ^bb31: // CHECK: br ^bb32([[VAL_85]] : f32) // CHECK: ^bb32([[VAL_89:%.*]]: f32): - // CHECK: [[VAL_90:%.*]], [[VAL_91:%.*]] = gpu.shuffle [[VAL_89]], [[VAL_9]], [[VAL_73]] xor : f32 + // CHECK: [[VAL_90:%.*]], [[VAL_91:%.*]] = gpu.shuffle xor [[VAL_89]], [[VAL_9]], [[VAL_73]] : f32 // CHECK: cond_br [[VAL_91]], ^bb33, ^bb34 // CHECK: ^bb33: // CHECK: [[VAL_92:%.*]] = arith.addf [[VAL_89]], [[VAL_90]] : f32 @@ -147,7 +147,7 @@ // CHECK: ^bb34: // CHECK: br ^bb35([[VAL_89]] : f32) // CHECK: ^bb35([[VAL_93:%.*]]: f32): - // CHECK: [[VAL_94:%.*]], [[VAL_95:%.*]] = gpu.shuffle [[VAL_93]], [[VAL_10]], [[VAL_73]] xor : f32 + // CHECK: [[VAL_94:%.*]], [[VAL_95:%.*]] = gpu.shuffle xor [[VAL_93]], [[VAL_10]], [[VAL_73]] : f32 // CHECK: cond_br [[VAL_95]], ^bb36, ^bb37 // CHECK: ^bb36: // CHECK: [[VAL_96:%.*]] = arith.addf [[VAL_93]], [[VAL_94]] : f32 @@ -157,15 +157,15 @@ // CHECK: ^bb38([[VAL_97:%.*]]: f32): // CHECK: br ^bb40([[VAL_97]] : f32) // CHECK: ^bb39: - // CHECK: [[VAL_98:%.*]], [[VAL_99:%.*]] = gpu.shuffle [[VAL_76]], [[VAL_6]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_98:%.*]], [[VAL_99:%.*]] = gpu.shuffle xor [[VAL_76]], [[VAL_6]], [[VAL_5]] : f32 // CHECK: [[VAL_100:%.*]] = arith.addf [[VAL_76]], [[VAL_98]] : f32 - // CHECK: [[VAL_101:%.*]], [[VAL_102:%.*]] = gpu.shuffle [[VAL_100]], [[VAL_7]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_101:%.*]], [[VAL_102:%.*]] = gpu.shuffle xor [[VAL_100]], [[VAL_7]], [[VAL_5]] : f32 // CHECK: [[VAL_103:%.*]] = arith.addf [[VAL_100]], [[VAL_101]] : f32 - // CHECK: [[VAL_104:%.*]], [[VAL_105:%.*]] = gpu.shuffle [[VAL_103]], [[VAL_8]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_104:%.*]], [[VAL_105:%.*]] = gpu.shuffle xor [[VAL_103]], [[VAL_8]], [[VAL_5]] : f32 // CHECK: [[VAL_106:%.*]] = arith.addf [[VAL_103]], [[VAL_104]] : f32 - // CHECK: [[VAL_107:%.*]], [[VAL_108:%.*]] = gpu.shuffle [[VAL_106]], [[VAL_9]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_107:%.*]], [[VAL_108:%.*]] = gpu.shuffle xor [[VAL_106]], [[VAL_9]], [[VAL_5]] : f32 // CHECK: [[VAL_109:%.*]] = arith.addf [[VAL_106]], [[VAL_107]] : f32 - // CHECK: [[VAL_110:%.*]], [[VAL_111:%.*]] = gpu.shuffle [[VAL_109]], [[VAL_10]], [[VAL_5]] xor : f32 + // CHECK: [[VAL_110:%.*]], [[VAL_111:%.*]] = gpu.shuffle xor [[VAL_109]], [[VAL_10]], [[VAL_5]] : f32 // CHECK: [[VAL_112:%.*]] = arith.addf [[VAL_109]], [[VAL_110]] : f32 // CHECK: br ^bb40([[VAL_112]] : f32) // CHECK: ^bb40([[VAL_113:%.*]]: f32): @@ -175,7 +175,7 @@ // CHECK: br ^bb42 // CHECK: ^bb42: // CHECK: gpu.barrier - %sum = "gpu.all_reduce"(%arg0) ({}) {op = "add"} : (f32) -> (f32) + %sum = gpu.all_reduce add %arg0 {} : (f32) -> (f32) gpu.return } diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -194,42 +194,6 @@ // ----- -func @illegal_dimension() 
{ - // expected-error@+1 {{dimension "o" is invalid}} - %tIdX = "gpu.thread_id"() {dimension = "o"} : () -> (index) - - return -} - -// ----- - -func @illegal_dimension() { - // expected-error@+1 {{dimension "o" is invalid}} - %bDimX = "gpu.block_dim"() {dimension = "o"} : () -> (index) - - return -} - -// ----- - -func @illegal_dimension() { - // expected-error@+1 {{dimension "o" is invalid}} - %bIdX = "gpu.block_id"() {dimension = "o"} : () -> (index) - - return -} - -// ----- - -func @illegal_dimension() { - // expected-error@+1 {{dimension "o" is invalid}} - %gDimX = "gpu.grid_dim"() {dimension = "o"} : () -> (index) - - return -} - -// ----- - func @reduce_no_op_no_body(%arg0 : f32) { // expected-error@+1 {{expected either an op attribute or a non-empty body}} %res = "gpu.all_reduce"(%arg0) ({}) : (f32) -> (f32) @@ -243,15 +207,15 @@ %res = "gpu.all_reduce"(%arg0) ({ ^bb(%lhs : f32, %rhs : f32): "gpu.yield"(%lhs) : (f32) -> () - }) {op = "add"} : (f32) -> (f32) + }) {op = #gpu<"all_reduce_op add">} : (f32) -> (f32) return } // ----- func @reduce_invalid_op(%arg0 : f32) { - // expected-error@+1 {{attribute 'op' failed to satisfy constraint}} - %res = "gpu.all_reduce"(%arg0) ({}) {op = "foo"} : (f32) -> (f32) + // expected-error@+1 {{invalid op kind}} + %res = gpu.all_reduce foo %arg0 {} : (f32) -> (f32) return } @@ -259,7 +223,7 @@ func @reduce_invalid_op_type(%arg0 : f32) { // expected-error@+1 {{`and` accumulator is only compatible with Integer type}} - %res = "gpu.all_reduce"(%arg0) ({}) {op = "and"} : (f32) -> (f32) + %res = gpu.all_reduce and %arg0 {} : (f32) -> (f32) return } @@ -267,10 +231,10 @@ func @reduce_incorrect_region_arguments(%arg0 : f32) { // expected-error@+1 {{expected two region arguments}} - %res = "gpu.all_reduce"(%arg0) ({ + %res = gpu.all_reduce %arg0 { ^bb(%lhs : f32): "gpu.yield"(%lhs) : (f32) -> () - }) : (f32) -> (f32) + } : (f32) -> (f32) return } @@ -278,10 +242,10 @@ func @reduce_incorrect_region_arguments(%arg0 : f32) { // expected-error@+1 {{incorrect region argument type}} - %res = "gpu.all_reduce"(%arg0) ({ + %res = gpu.all_reduce %arg0 { ^bb(%lhs : f32, %rhs : i32): "gpu.yield"(%lhs) : (f32) -> () - }) : (f32) -> (f32) + } : (f32) -> (f32) return } @@ -289,10 +253,10 @@ func @reduce_incorrect_yield(%arg0 : f32) { // expected-error@+1 {{expected one gpu.yield operand}} - %res = "gpu.all_reduce"(%arg0) ({ + %res = gpu.all_reduce %arg0 { ^bb(%lhs : f32, %rhs : f32): "gpu.yield"(%lhs, %rhs) : (f32, f32) -> () - }) : (f32) -> (f32) + } : (f32) -> (f32) return } @@ -300,11 +264,11 @@ func @reduce_incorrect_yield(%arg0 : f32) { // expected-error@+1 {{incorrect gpu.yield type}} - %res = "gpu.all_reduce"(%arg0) ({ + %res = gpu.all_reduce %arg0 { ^bb(%lhs : f32, %rhs : f32): %one = arith.constant 1 : i32 "gpu.yield"(%one) : (i32) -> () - }) : (f32) -> (f32) + } : (f32) -> (f32) return } @@ -312,25 +276,27 @@ func @reduce_incorrect_yield(%arg0 : f32) { // expected-error@+1 {{expected gpu.yield op in region}} - %res = "gpu.all_reduce"(%arg0) ({ + %res = gpu.all_reduce %arg0 { ^bb(%lhs : f32, %rhs : f32): return - }) : (f32) -> (f32) + } : (f32) -> (f32) return } // ----- func @shuffle_mismatching_type(%arg0 : f32, %arg1 : i32, %arg2 : i32) { - // expected-error@+1 {{requires the same type for value operand and result}} - %shfl, %pred = "gpu.shuffle"(%arg0, %arg1, %arg2) { mode = "xor" } : (f32, i32, i32) -> (i32, i1) + // expected-error@+1 {{inferred type(s) 'f32', 'i1' are incompatible with return type(s) of operation 'i32', 'i1'}} + %shfl, %pred = 
"gpu.shuffle"(%arg0, %arg1, %arg2) { mode = #gpu<"shuffle_mode xor"> } : (f32, i32, i32) -> (i32, i1) + return } // ----- func @shuffle_unsupported_type(%arg0 : index, %arg1 : i32, %arg2 : i32) { - // expected-error@+1 {{requires value operand type to be f32 or i32}} - %shfl, %pred = gpu.shuffle %arg0, %arg1, %arg2 xor : index + // expected-error@+1 {{operand #0 must be i32 or f32}} + %shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : index + return } // ----- diff --git a/mlir/test/Dialect/GPU/multiple-all-reduce.mlir b/mlir/test/Dialect/GPU/multiple-all-reduce.mlir --- a/mlir/test/Dialect/GPU/multiple-all-reduce.mlir +++ b/mlir/test/Dialect/GPU/multiple-all-reduce.mlir @@ -10,9 +10,9 @@ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1, %block_z = %c1) { %val = memref.load %data[%bx, %tx] : memref<2x6xf32> - %reduced0 = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32) + %reduced0 = gpu.all_reduce add %val {} : (f32) -> (f32) memref.store %reduced0, %sum[%bx] : memref<2xf32> - %reduced1 = "gpu.all_reduce"(%val) ({}) { op = "mul" } : (f32) -> (f32) + %reduced1 = gpu.all_reduce mul %val {} : (f32) -> (f32) memref.store %reduced1, %mul[%bx] : memref<2xf32> gpu.terminator } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -28,39 +28,39 @@ gpu.module @kernels { gpu.func @kernel_1(%arg0 : f32, %arg1 : memref) kernel { - %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) - %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index) - %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index) + %tIdX = gpu.thread_id x + %tIdY = gpu.thread_id y + %tIdZ = gpu.thread_id z - %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) - %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index) - %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index) + %bDimX = gpu.block_dim x + %bDimY = gpu.block_dim y + %bDimZ = gpu.block_dim z - %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index) - %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) - %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index) + %bIdX = gpu.block_id x + %bIdY = gpu.block_id y + %bIdZ = gpu.block_id z - %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index) - %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index) - %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) + %gDimX = gpu.grid_dim x + %gDimY = gpu.grid_dim y + %gDimZ = gpu.grid_dim z %sgId = gpu.subgroup_id : index %numSg = gpu.num_subgroups : index %SgSi = gpu.subgroup_size : index %one = arith.constant 1.0 : f32 - %sum = "gpu.all_reduce"(%one) ({}) {op = "add"} : (f32) -> (f32) + %sum = gpu.all_reduce add %one {} : (f32) -> (f32) %width = arith.constant 7 : i32 %offset = arith.constant 3 : i32 - // CHECK: gpu.shuffle %{{.*}}, %{{.*}}, %{{.*}} xor : f32 - %shfl, %pred = gpu.shuffle %arg0, %offset, %width xor : f32 - // CHECK: gpu.shuffle %{{.*}}, %{{.*}}, %{{.*}} up : f32 - %shfl1, %pred1 = gpu.shuffle %arg0, %offset, %width up : f32 - // CHECK: gpu.shuffle %{{.*}}, %{{.*}}, %{{.*}} down : f32 - %shfl2, %pred2 = gpu.shuffle %arg0, %offset, %width down : f32 - // CHECK: gpu.shuffle %{{.*}}, %{{.*}}, %{{.*}} idx : f32 - %shfl3, %pred3 = gpu.shuffle %arg0, %offset, %width idx : f32 + // CHECK: gpu.shuffle xor %{{.*}}, %{{.*}}, %{{.*}} : f32 + %shfl, %pred = gpu.shuffle xor %arg0, %offset, %width : f32 + // CHECK: 
gpu.shuffle up %{{.*}}, %{{.*}}, %{{.*}} : f32 + %shfl1, %pred1 = gpu.shuffle up %arg0, %offset, %width : f32 + // CHECK: gpu.shuffle down %{{.*}}, %{{.*}}, %{{.*}} : f32 + %shfl2, %pred2 = gpu.shuffle down %arg0, %offset, %width : f32 + // CHECK: gpu.shuffle idx %{{.*}}, %{{.*}}, %{{.*}} : f32 + %shfl3, %pred3 = gpu.shuffle idx %arg0, %offset, %width : f32 "gpu.barrier"() : () -> () @@ -234,10 +234,10 @@ %0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp"> // CHECK: gpu.subgroup_mma_load_matrix %[[wg]][%[[i]], %[[i]]] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp"> %1 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp"> - // CHECK: gpu.subgroup_mma_elementwise %{{.*}}, %{{.*}} {operation = "ADDF"} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> - %2 = gpu.subgroup_mma_elementwise %1, %1 {operation = "ADDF"} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> - // CHECK: gpu.subgroup_mma_elementwise %{{.*}}, %{{.*}} {operation = "MAXF"} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> - %3 = gpu.subgroup_mma_elementwise %2, %1 {operation = "MAXF"} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> + // CHECK: gpu.subgroup_mma_elementwise addf %{{.*}}, %{{.*}} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> + %2 = gpu.subgroup_mma_elementwise addf %1, %1 : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> + // CHECK: gpu.subgroup_mma_elementwise maxf %{{.*}}, %{{.*}} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> + %3 = gpu.subgroup_mma_elementwise maxf %2, %1 : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> return } } diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -41,18 +41,18 @@ // CHECK-LABEL: gpu.module @launch_kernel // CHECK-NEXT: gpu.func @launch_kernel // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref) -// CHECK-NEXT: %[[BID:.*]] = "gpu.block_id"() {dimension = "x"} : () -> index -// CHECK-NEXT: = "gpu.block_id"() {dimension = "y"} : () -> index -// CHECK-NEXT: = "gpu.block_id"() {dimension = "z"} : () -> index -// CHECK-NEXT: %[[TID:.*]] = "gpu.thread_id"() {dimension = "x"} : () -> index -// CHECK-NEXT: = "gpu.thread_id"() {dimension = "y"} : () -> index -// CHECK-NEXT: = "gpu.thread_id"() {dimension = "z"} : () -> index -// CHECK-NEXT: = "gpu.grid_dim"() {dimension = "x"} : () -> index -// CHECK-NEXT: = "gpu.grid_dim"() {dimension = "y"} : () -> index -// CHECK-NEXT: = "gpu.grid_dim"() {dimension = "z"} : () -> index -// CHECK-NEXT: %[[BDIM:.*]] = "gpu.block_dim"() {dimension = "x"} : () -> index -// CHECK-NEXT: = "gpu.block_dim"() {dimension = "y"} : () -> index -// CHECK-NEXT: = "gpu.block_dim"() {dimension = "z"} : () -> index +// CHECK-NEXT: %[[BID:.*]] = gpu.block_id x +// CHECK-NEXT: = gpu.block_id y +// CHECK-NEXT: = gpu.block_id z +// CHECK-NEXT: %[[TID:.*]] = gpu.thread_id x +// CHECK-NEXT: = 
gpu.thread_id y +// CHECK-NEXT: = gpu.thread_id z +// CHECK-NEXT: = gpu.grid_dim x +// CHECK-NEXT: = gpu.grid_dim y +// CHECK-NEXT: = gpu.grid_dim z +// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x +// CHECK-NEXT: = gpu.block_dim y +// CHECK-NEXT: = gpu.block_dim z // CHECK-NEXT: br ^[[BLOCK:.*]] // CHECK-NEXT: ^[[BLOCK]]: // CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> () diff --git a/mlir/test/Dialect/GPU/promotion.mlir b/mlir/test/Dialect/GPU/promotion.mlir --- a/mlir/test/Dialect/GPU/promotion.mlir +++ b/mlir/test/Dialect/GPU/promotion.mlir @@ -11,12 +11,12 @@ // CHECK-DAG: %[[c1:.*]] = arith.constant 1 // CHECK-DAG: %[[c4:.*]] = arith.constant 4 // CHECK-DAG: %[[c5:.*]] = arith.constant 5 - // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"} - // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"} - // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"} - // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"} - // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"} - // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"} + // CHECK-DAG: %[[tx:.*]] = gpu.thread_id x + // CHECK-DAG: %[[ty:.*]] = gpu.thread_id y + // CHECK-DAG: %[[tz:.*]] = gpu.thread_id z + // CHECK-DAG: %[[bdx:.*]] = gpu.block_dim x + // CHECK-DAG: %[[bdy:.*]] = gpu.block_dim y + // CHECK-DAG: %[[bdz:.*]] = gpu.block_dim z // Verify that loops for the copy are emitted. We only check the number of // loops here since their bounds are produced by mapLoopToProcessorIds, @@ -65,12 +65,12 @@ // CHECK-DAG: %[[c6:.*]] = arith.constant 6 // CHECK-DAG: %[[c7:.*]] = arith.constant 7 // CHECK-DAG: %[[c8:.*]] = arith.constant 8 - // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"} - // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"} - // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"} - // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"} - // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"} - // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"} + // CHECK-DAG: %[[tx:.*]] = gpu.thread_id x + // CHECK-DAG: %[[ty:.*]] = gpu.thread_id y + // CHECK-DAG: %[[tz:.*]] = gpu.thread_id z + // CHECK-DAG: %[[bdx:.*]] = gpu.block_dim x + // CHECK-DAG: %[[bdy:.*]] = gpu.block_dim y + // CHECK-DAG: %[[bdz:.*]] = gpu.block_dim z // Verify that loops for the copy are emitted. 
// CHECK: scf.for %[[i0:.*]] = diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -495,21 +495,21 @@ func @nvvm_invalid_shfl_pred_1(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} - %0 = nvvm.shfl.sync "bfly" %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> i32 + %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> i32 } // ----- func @nvvm_invalid_shfl_pred_2(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} - %0 = nvvm.shfl.sync "bfly" %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32)> + %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32)> } // ----- func @nvvm_invalid_shfl_pred_3(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} - %0 = nvvm.shfl.sync "bfly" %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i32)> + %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i32)> } // ----- @@ -1042,7 +1042,7 @@ llvm.func @wmmaLoadOp_invalid_mem_space(%arg0: !llvm.ptr, %arg1: i32) { // expected-error@+1 {{'nvvm.wmma.load' op expected source pointer in memory space 0, 1, 3}} %0 = nvvm.wmma.load %arg0, %arg1 - {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> llvm.return } @@ -1052,7 +1052,7 @@ llvm.func @wmmaLoadOp_invalid_AOp(%arg0: !llvm.ptr, %arg1: i32) { // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 8 elements of type 'vector<2xf16>'}} %0 = nvvm.wmma.load %arg0, %arg1 - {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> llvm.return } @@ -1062,7 +1062,7 @@ llvm.func @wmmaLoadOp_invalid_BOp(%arg0: !llvm.ptr, %arg1: i32) { // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 8 elements of type 'vector<2xf16>'}} %0 = nvvm.wmma.load %arg0, %arg1 - {eltype = "f16", frag = "b", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<b>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> llvm.return @@ -1073,7 +1073,7 @@ llvm.func @wmmaLoadOp_invalid_COp(%arg0: !llvm.ptr, %arg1: i32) { // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 4 elements of type 'vector<2xf16>'}} %0 = nvvm.wmma.load %arg0, %arg1 - {eltype = "f16", frag = 
"c", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)> llvm.return @@ -1086,7 +1086,7 @@ %arg4: vector<2 x f16>, %arg5: vector<2 xf16>) { // expected-error@+1 {{'nvvm.wmma.store' op expected operands to be a source pointer in memory space 0, 1, 3}} nvvm.wmma.store %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 - {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16> llvm.return } @@ -1105,7 +1105,7 @@ %arg18: vector<2 x f16>) { // expected-error@+1 {{'nvvm.wmma.mma' op expected 20 arguments}} %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18 - {eltypeA = "f16", eltypeB = "f16", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} + {eltypeA = #nvvm.mma_type, eltypeB = #nvvm.mma_type, k = 16 : i32, layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>) -> !llvm.struct<(vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)> @@ -1126,7 +1126,7 @@ %arg18: vector<2 x f16>, %arg19: vector<2 x f16>) { // expected-error@+1 {{'nvvm.wmma.mma' op expected destination type is a structure of 4 elements of type 'vector<2xf16>'}} %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19 - {eltypeA = "f16", eltypeB = "f16", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} + {eltypeA = #nvvm.mma_type, eltypeB = #nvvm.mma_type, k = 16 : i32, layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>) -> !llvm.struct<(vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)> llvm.return @@ -1146,7 +1146,7 @@ %arg20: f32, %arg21: f32, %arg22: f32, %arg23: f32) { // expected-error@+1 {{'nvvm.wmma.mma' op expected argument 15 to be of type 'vector<2xf16>'}} %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23 - {eltypeA = "f16", eltypeB = "f32", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} + {eltypeA = #nvvm.mma_type, eltypeB = #nvvm.mma_type, k = 16 : i32, layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, 
vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> llvm.return } @@ -1165,7 +1165,7 @@ %arg20: f32, %arg21: f32, %arg22: f32, %arg23: vector<2xf16>) { // expected-error@+1 {{'nvvm.wmma.mma' op expected argument 23 to be of type 'f32'}} %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23 - {eltypeA = "f16", eltypeB = "f32", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} + {eltypeA = #nvvm.mma_type<f16>, eltypeB = #nvvm.mma_type<f32>, k = 16 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, vector<2xf16>) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> llvm.return } @@ -1184,7 +1184,7 @@ %arg20: f32, %arg21: f32, %arg22: f32, %arg23: f32) { // expected-error@+1 {{'nvvm.wmma.mma' op expected destination type is a structure of 8 elements of type 'f32'}} %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23 - {eltypeA = "f16", eltypeB = "f32", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} + {eltypeA = #nvvm.mma_type<f16>, eltypeB = #nvvm.mma_type<f32>, k = 16 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, vector<2xf16>)> llvm.return } diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -37,26 +37,26 @@ func @nvvm_shfl( %arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32, %arg4 : f32) -> i32 { - // CHECK: nvvm.shfl.sync "bfly" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i32 -> i32 - %0 = nvvm.shfl.sync "bfly" %arg0, %arg3, %arg1, %arg2 : i32 -> i32 - // CHECK: nvvm.shfl.sync "bfly" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 -> f32 - %1 = nvvm.shfl.sync "bfly" %arg0, %arg4, %arg1, %arg2 : f32 -> f32 - // CHECK: nvvm.shfl.sync "up" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 -> f32 - %2 = nvvm.shfl.sync "up" %arg0, %arg4, %arg1, %arg2 : f32 -> f32 - // CHECK: nvvm.shfl.sync "down" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 -> f32 - %3 = nvvm.shfl.sync "down" %arg0, %arg4, %arg1, %arg2 : f32 -> f32 - // CHECK: nvvm.shfl.sync "idx" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 -> f32 - %4 = nvvm.shfl.sync "idx" %arg0, %arg4, %arg1, %arg2 : f32 -> f32 + // CHECK: nvvm.shfl.sync bfly %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i32 -> i32 + %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 : i32 -> i32 + // CHECK: nvvm.shfl.sync bfly %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 -> f32 + %1 = nvvm.shfl.sync bfly %arg0, %arg4, %arg1, %arg2 : f32 -> f32 + // 
CHECK: nvvm.shfl.sync up %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 -> f32 + %2 = nvvm.shfl.sync up %arg0, %arg4, %arg1, %arg2 : f32 -> f32 + // CHECK: nvvm.shfl.sync down %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 -> f32 + %3 = nvvm.shfl.sync down %arg0, %arg4, %arg1, %arg2 : f32 -> f32 + // CHECK: nvvm.shfl.sync idx %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 -> f32 + %4 = nvvm.shfl.sync idx %arg0, %arg4, %arg1, %arg2 : f32 -> f32 llvm.return %0 : i32 } func @nvvm_shfl_pred( %arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32, %arg4 : f32) -> !llvm.struct<(i32, i1)> { - // CHECK: nvvm.shfl.sync "bfly" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> - %0 = nvvm.shfl.sync "bfly" %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> - // CHECK: nvvm.shfl.sync "bfly" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> - %1 = nvvm.shfl.sync "bfly" %arg0, %arg4, %arg1, %arg2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> + // CHECK: nvvm.shfl.sync bfly %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> + %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> + // CHECK: nvvm.shfl.sync bfly %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> + %1 = nvvm.shfl.sync bfly %arg0, %arg4, %arg1, %arg2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> llvm.return %0 : !llvm.struct<(i32, i1)> } @@ -71,14 +71,14 @@ %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32, %c4 : f32, %c5 : f32, %c6 : f32, %c7 : f32) { // CHECK: nvvm.mma.sync {{.*}} {alayout = "row", blayout = "col"} : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> - %0 = nvvm.mma.sync %a0, %a1, %b0, %b1, %c0, %c1, %c2, %c3, %c4, %c5, %c6, %c7 {alayout="row", blayout="col"} : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> + %0 = nvvm.mma.sync %a0, %a1, %b0, %b1, %c0, %c1, %c2, %c3, %c4, %c5, %c6, %c7 {alayout = "row", blayout = "col"} : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> llvm.return %0 : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> } func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr, %arg1 : i32) -> !llvm.struct<(i32, i32, i32, i32)> { - // CHECK: nvvm.wmma.load {{.*}} {eltype = "tf32", frag = "a", k = 8 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + // CHECK: nvvm.wmma.load {{.*}} {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} %0 = nvvm.wmma.load %arg0, %arg1 - {eltype = "tf32", frag = "a", k = 8 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)> llvm.return %0 : !llvm.struct<(i32, i32, i32, i32)> } @@ -87,9 +87,9 @@ %6 : i32, %7 : i32, %8 : f32, %9 : f32, %10 : f32, %11 : f32, %12 : f32, %13 : f32, %14 : f32, %15 : f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> { - // CHECK: nvvm.wmma.mma {{.*}} {eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", 
layoutB = "row", m = 16 : i32, n = 16 : i32} + // CHECK: nvvm.wmma.mma {{.*}} {eltypeA = #nvvm.mma_type, eltypeB = #nvvm.mma_type, k = 8 : i32, layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} %r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15 - {eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} + {eltypeA = #nvvm.mma_type, eltypeB = #nvvm.mma_type, k = 8 : i32, layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> llvm.return %r : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> diff --git a/mlir/test/Dialect/Linalg/distribute-tiled-loop.mlir b/mlir/test/Dialect/Linalg/distribute-tiled-loop.mlir --- a/mlir/test/Dialect/Linalg/distribute-tiled-loop.mlir +++ b/mlir/test/Dialect/Linalg/distribute-tiled-loop.mlir @@ -25,13 +25,13 @@ // CHECK-LABEL: func @distribute_for_gpu // CHECK: %[[C64:.*]] = arith.constant 64 : index -// CHECK-DAG: %[[GPU_BLOCK_X:.*]] = "gpu.block_id"() {dimension = "x"} -// CHECK-DAG: %[[GPU_GRID_DIM_X:.*]] = "gpu.grid_dim"() {dimension = "x"} +// CHECK-DAG: %[[GPU_BLOCK_X:.*]] = gpu.block_id x +// CHECK-DAG: %[[GPU_GRID_DIM_X:.*]] = gpu.grid_dim x // CHECK-DAG: %[[LB_I:.*]] = affine.apply #[[$MAP0]](){{\[}}%[[GPU_BLOCK_X]]] // CHECK-DAG: %[[STEP_I:.*]] = affine.apply #[[$MAP0]](){{\[}}%[[GPU_GRID_DIM_X]]] -// CHECK-DAG: %[[GPU_BLOCK_Y:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK-DAG: %[[GPU_GRID_DIM_Y:.*]] = "gpu.grid_dim"() {dimension = "y"} +// CHECK-DAG: %[[GPU_BLOCK_Y:.*]] = gpu.block_id y +// CHECK-DAG: %[[GPU_GRID_DIM_Y:.*]] = gpu.grid_dim y // CHECK-DAG: %[[LB_J:.*]] = affine.apply #[[$MAP1]](){{\[}}%[[GPU_BLOCK_Y]]] // CHECK-DAG: %[[STEP_J:.*]] = affine.apply #[[$MAP1]](){{\[}}%[[GPU_GRID_DIM_Y]]] diff --git a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir --- a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir +++ b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir @@ -12,8 +12,8 @@ // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x // CHECK: scf.for %[[ARG3:.*]] = // CHECK: %[[OFFSETY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] // CHECK: %[[SV1:.*]] = memref.subview %[[ARG0]][%[[OFFSETY]], %[[ARG3]]] @@ -38,8 +38,8 @@ // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x // CHECK: %[[ITERY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] // CHECK: %[[ITERX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] // CHECK: %[[INBOUNDSY:.*]] = arith.cmpi slt, %[[ITERY]], %{{.*}} @@ -70,10 +70,10 @@ // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} -// 
CHECK-DAG: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"} -// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} -// CHECK-DAG: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"} +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[NBLOCKSY:.*]] = gpu.grid_dim y +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x +// CHECK-DAG: %[[NBLOCKSX:.*]] = gpu.grid_dim x // CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] // CHECK: %[[STEPY:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSY]]] // CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] @@ -99,8 +99,8 @@ // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x // CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] // CHECK: %[[INBOUNDS:.*]] = arith.cmpi slt, %[[LBX]], %{{.*}} // CHECK: scf.if %[[INBOUNDS]] @@ -128,9 +128,9 @@ // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} -// CHECK-DAG: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"} +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x +// CHECK-DAG: %[[NBLOCKSX:.*]] = gpu.grid_dim x // CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] // CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] // CHECK: %[[STEPX:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSX]]] @@ -159,9 +159,9 @@ // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK-DAG: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"} -// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[NBLOCKSY:.*]] = gpu.grid_dim y +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x // CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] // CHECK: %[[STEPY:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSY]]] // CHECK: scf.parallel (%[[ARG3:.*]]) = (%[[LBY]]) to (%{{.*}}) step (%[[STEPY]]) @@ -186,10 +186,10 @@ -> tensor { // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK-DAG: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"} -// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} -// CHECK-DAG: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"} +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[NBLOCKSY:.*]] = gpu.grid_dim y +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x +// CHECK-DAG: %[[NBLOCKSX:.*]] = gpu.grid_dim x // CHECK: %[[MUL:.+]] = affine.apply #[[MULMAP]]()[%[[BIDY]], %[[C8]]] // CHECK: %[[LBY:.+]] = affine.apply #[[ADDMAP]]()[%[[MUL]], %[[C0]]] // CHECK: %[[STEPY:.+]] = affine.apply #[[MULMAP]]()[%[[NBLOCKSY]], %[[C8]]] diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -376,9 +376,9 @@ acc.parallel private(%a: memref<10xf32>, %c: 
memref<10x10xf32>) firstprivate(%b: memref<10xf32>) { } acc.parallel { - } attributes {defaultAttr = "none"} + } attributes {defaultAttr = #acc<"defaultvalue none">} acc.parallel { - } attributes {defaultAttr = "present"} + } attributes {defaultAttr = #acc<"defaultvalue present">} acc.parallel { } attributes {asyncAttr} acc.parallel { @@ -441,9 +441,9 @@ // CHECK: acc.parallel private([[ARGA]]: memref<10xf32>, [[ARGC]]: memref<10x10xf32>) firstprivate([[ARGB]]: memref<10xf32>) { // CHECK-NEXT: } // CHECK: acc.parallel { -// CHECK-NEXT: } attributes {defaultAttr = "none"} +// CHECK-NEXT: } attributes {defaultAttr = #acc<"defaultvalue none">} // CHECK: acc.parallel { -// CHECK-NEXT: } attributes {defaultAttr = "present"} +// CHECK-NEXT: } attributes {defaultAttr = #acc<"defaultvalue present">} // CHECK: acc.parallel { // CHECK-NEXT: } attributes {asyncAttr} // CHECK: acc.parallel { @@ -482,11 +482,11 @@ acc.data copyin(%b: memref<10xf32>) copyout(%c: memref<10x10xf32>) present(%a: memref<10xf32>) { } acc.data present(%a : memref<10xf32>) { - } attributes { defaultAttr = "none" } + } attributes { defaultAttr = #acc<"defaultvalue none"> } acc.data present(%a : memref<10xf32>) { - } attributes { defaultAttr = "present" } + } attributes { defaultAttr = #acc<"defaultvalue present"> } acc.data { - } attributes { defaultAttr = "none" } + } attributes { defaultAttr = #acc<"defaultvalue none"> } return } @@ -519,11 +519,11 @@ // CHECK: acc.data copyin([[ARGB]] : memref<10xf32>) copyout([[ARGC]] : memref<10x10xf32>) present([[ARGA]] : memref<10xf32>) { // CHECK-NEXT: } // CHECK: acc.data present([[ARGA]] : memref<10xf32>) { -// CHECK-NEXT: } attributes {defaultAttr = "none"} +// CHECK-NEXT: } attributes {defaultAttr = #acc<"defaultvalue none">} // CHECK: acc.data present([[ARGA]] : memref<10xf32>) { -// CHECK-NEXT: } attributes {defaultAttr = "present"} +// CHECK-NEXT: } attributes {defaultAttr = #acc<"defaultvalue present">} // CHECK: acc.data { -// CHECK-NEXT: } attributes {defaultAttr = "none"} +// CHECK-NEXT: } attributes {defaultAttr = #acc<"defaultvalue none">} // ----- diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -155,7 +155,7 @@ // ----- func @order_value(%lb : index, %ub : index, %step : index) { - // expected-error @below {{attribute 'order_val' failed to satisfy constraint: OrderKind Clause}} + // expected-error @below {{invalid order kind}} omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) order(default) { omp.yield } @@ -476,7 +476,7 @@ func @omp_ordered3(%vec0 : i64) -> () { // expected-error @below {{ordered depend directive must be closely nested inside a worksharing-loop with ordered clause with parameter present}} - omp.ordered depend_type("dependsink") depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} return } @@ -485,7 +485,7 @@ func @omp_ordered4(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64) -> () { omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(0) { // expected-error @below {{ordered depend directive must be closely nested inside a worksharing-loop with ordered clause with parameter present}} - omp.ordered depend_type("dependsink") depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} omp.yield } @@ -496,7 +496,7 @@ func @omp_ordered5(%arg1 : i32, 
%arg2 : i32, %arg3 : i32, %vec0 : i64, %vec1 : i64) -> () { omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(1) { // expected-error @below {{number of variables in depend clause does not match number of iteration variables in the doacross loop}} - omp.ordered depend_type("dependsource") depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64} + omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64} omp.yield } @@ -514,7 +514,7 @@ // ----- func @omp_atomic_read2(%x: memref, %v: memref) { - // expected-error @below {{attribute 'memory_order' failed to satisfy constraint: MemoryOrderKind Clause}} + // expected-error @below {{invalid memory order kind}} omp.atomic.read %v = %x memory_order(xyz) : memref return } @@ -602,7 +602,7 @@ // ----- func @omp_atomic_write6(%addr : memref, %val : i32) { - // expected-error @below {{attribute 'memory_order' failed to satisfy constraint: MemoryOrderKind Clause}} + // expected-error @below {{invalid memory order kind}} omp.atomic.write %addr = %val memory_order(xyz) : memref, i32 return } diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -59,7 +59,7 @@ // CHECK: omp.parallel num_threads(%{{.*}} : si32) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) shared(%{{.*}} : memref) copyin(%{{.*}} : memref) allocate(%{{.*}} : memref -> %{{.*}} : memref) "omp.parallel"(%num_threads, %data_var, %data_var, %data_var, %data_var, %data_var, %data_var) ({ omp.terminator - }) {operand_segment_sizes = dense<[0,1,1,1,1,1,1,1]>: vector<8xi32>, default_val = "defshared"} : (si32, memref, memref, memref, memref, memref, memref) -> () + }) {operand_segment_sizes = dense<[0,1,1,1,1,1,1,1]>: vector<8xi32>, default_val = #omp<"clause_default defshared">} : (si32, memref, memref, memref, memref, memref, memref) -> () // CHECK: omp.barrier omp.barrier @@ -77,7 +77,7 @@ }) {operand_segment_sizes = dense<[1,1,1,1,1,1,0,0]> : vector<8xi32>} : (i1, si32, memref, memref, memref, memref) -> () omp.terminator - }) {operand_segment_sizes = dense<[1,1,1,1,1,1,1,1]> : vector<8xi32>, proc_bind_val = "spread"} : (i1, si32, memref, memref, memref, memref, memref, memref) -> () + }) {operand_segment_sizes = dense<[1,1,1,1,1,1,1,1]> : vector<8xi32>, proc_bind_val = #omp<"procbindkind spread">} : (i1, si32, memref, memref, memref, memref, memref, memref) -> () // test with multiple parameters for single variadic argument // CHECK: omp.parallel private(%{{.*}} : memref) firstprivate(%{{.*}} : memref, %{{.*}} : memref) shared(%{{.*}} : memref) copyin(%{{.*}} : memref) allocate(%{{.*}} : memref -> %{{.*}} : memref) @@ -160,28 +160,28 @@ "omp.wsloop" (%lb, %ub, %step, %data_var, %linear_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,0,0,0,1,1,0,0]> : vector<10xi32>, schedule_val = "Static"} : + }) {operand_segment_sizes = dense<[1,1,1,0,0,0,1,1,0,0]> : vector<10xi32>, schedule_val = #omp<"schedulekind Static">} : (index, index, index, memref, i32) -> () // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref, %{{.*}} = %{{.*}} : memref) schedule(static) "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %linear_var, %linear_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,0,0,0,2,2,0,0]> : vector<10xi32>, schedule_val = "Static"} : + }) {operand_segment_sizes = dense<[1,1,1,0,0,0,2,2,0,0]> : 
vector<10xi32>, schedule_val = #omp<"schedulekind Static">} : (index, index, index, memref, memref, i32, i32) -> () // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) lastprivate(%{{.*}} : memref) linear(%{{.*}} = %{{.*}} : memref) schedule(dynamic = %{{.*}}) collapse(3) ordered(2) "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %data_var, %data_var, %linear_var, %chunk_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,1,1,1,1,1,0,1]> : vector<10xi32>, schedule_val = "Dynamic", collapse_val = 3, ordered_val = 2} : + }) {operand_segment_sizes = dense<[1,1,1,1,1,1,1,1,0,1]> : vector<10xi32>, schedule_val = #omp<"schedulekind Dynamic">, collapse_val = 3, ordered_val = 2} : (index, index, index, memref, memref, memref, memref, i32, i32) -> () // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) schedule(auto) nowait "omp.wsloop" (%lb, %ub, %step, %data_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,1,0,0,0,0,0,0]> : vector<10xi32>, nowait, schedule_val = "Auto"} : + }) {operand_segment_sizes = dense<[1,1,1,1,0,0,0,0,0,0]> : vector<10xi32>, nowait, schedule_val = #omp<"schedulekind Auto">} : (index, index, index, memref) -> () return @@ -467,22 +467,22 @@ omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(1) { // Only one DEPEND(SINK: vec) clause - // CHECK: omp.ordered depend_type("dependsink") depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64} - omp.ordered depend_type("dependsink") depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} + // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} - // CHECK: omp.ordered depend_type("dependsource") depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64} - omp.ordered depend_type("dependsource") depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} + // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsource) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} omp.yield } omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(2) { // Multiple DEPEND(SINK: vec) clauses - // CHECK: omp.ordered depend_type("dependsink") depend_vec(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i64, i64, i64, i64) {num_loops_val = 2 : i64} - omp.ordered depend_type("dependsink") depend_vec(%vec0, %vec1, %vec2, %vec3 : i64, i64, i64, i64) {num_loops_val = 2 : i64} + // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i64, i64, i64, i64) {num_loops_val = 2 : i64} + omp.ordered depend_type(dependsink) depend_vec(%vec0, %vec1, %vec2, %vec3 : i64, i64, i64, i64) {num_loops_val = 2 : i64} - // CHECK: omp.ordered depend_type("dependsource") depend_vec(%{{.*}}, %{{.*}} : i64, i64) {num_loops_val = 2 : i64} - omp.ordered depend_type("dependsource") depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64} + // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}}, %{{.*}} : i64, i64) {num_loops_val = 2 : i64} + omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64} omp.yield } diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir --- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir +++ 
b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir @@ -55,7 +55,7 @@ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) { %val = memref.load %data[%bx, %tx] : memref<2x6xi32> - %reduced = "gpu.all_reduce"(%val) ({}) { op = "and" } : (i32) -> (i32) + %reduced = gpu.all_reduce and %val {} : (i32) -> (i32) memref.store %reduced, %sum[%bx] : memref<2xi32> gpu.terminator } diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir --- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir @@ -55,7 +55,7 @@ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) { %val = memref.load %data[%bx, %tx] : memref<2x6xi32> - %reduced = "gpu.all_reduce"(%val) ({}) { op = "max" } : (i32) -> (i32) + %reduced = gpu.all_reduce max %val {} : (i32) -> (i32) memref.store %reduced, %sum[%bx] : memref<2xi32> gpu.terminator } diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir --- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir @@ -55,7 +55,7 @@ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) { %val = memref.load %data[%bx, %tx] : memref<2x6xi32> - %reduced = "gpu.all_reduce"(%val) ({}) { op = "min" } : (i32) -> (i32) + %reduced = gpu.all_reduce min %val {} : (i32) -> (i32) memref.store %reduced, %sum[%bx] : memref<2xi32> gpu.terminator } diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir --- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir @@ -28,7 +28,7 @@ %idx = arith.addi %tx, %t2 : index %t3 = arith.index_cast %idx : index to i32 %val = arith.sitofp %t3 : i32 to f32 - %sum = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32) + %sum = gpu.all_reduce add %val {} : (f32) -> (f32) memref.store %sum, %dst[%tz, %ty, %tx] : memref gpu.terminator } diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir --- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir @@ -55,7 +55,7 @@ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) { %val = memref.load %data[%bx, %tx] : memref<2x6xi32> - %reduced = "gpu.all_reduce"(%val) ({}) { op = "or" } : (i32) -> (i32) + %reduced = gpu.all_reduce or %val {} : (i32) -> (i32) memref.store %reduced, %sum[%bx] : memref<2xi32> gpu.terminator } diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir --- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir @@ -20,11 +20,11 @@ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) { %val = arith.index_cast %tx : index to i32 - %xor = "gpu.all_reduce"(%val) ({ + %xor = gpu.all_reduce %val { ^bb(%lhs : i32, %rhs : 
i32): %xor = arith.xori %lhs, %rhs : i32 "gpu.yield"(%xor) : (i32) -> () - }) : (i32) -> (i32) + } : (i32) -> (i32) %res = arith.sitofp %xor : i32 to f32 memref.store %res, %dst[%tx] : memref gpu.terminator diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir --- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir @@ -55,7 +55,7 @@ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) { %val = memref.load %data[%bx, %tx] : memref<2x6xi32> - %reduced = "gpu.all_reduce"(%val) ({}) { op = "xor" } : (i32) -> (i32) + %reduced = gpu.all_reduce xor %val {} : (i32) -> (i32) memref.store %reduced, %sum[%bx] : memref<2xi32> gpu.terminator } diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir --- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir +++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir @@ -58,9 +58,9 @@ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) { %val = memref.load %data[%bx, %tx] : memref<2x6xf32> - %reduced0 = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32) + %reduced0 = gpu.all_reduce add %val {} : (f32) -> (f32) memref.store %reduced0, %sum[%bx] : memref<2xf32> - %reduced1 = "gpu.all_reduce"(%val) ({}) { op = "mul" } : (f32) -> (f32) + %reduced1 = gpu.all_reduce mul %val {} : (f32) -> (f32) memref.store %reduced1, %mul[%bx] : memref<2xf32> gpu.terminator } diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir --- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir +++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir @@ -23,7 +23,7 @@ %val = arith.sitofp %t0 : i32 to f32 %width = arith.index_cast %block_x : index to i32 %offset = arith.constant 4 : i32 - %shfl, %valid = gpu.shuffle %val, %offset, %width xor : f32 + %shfl, %valid = gpu.shuffle xor %val, %offset, %width : f32 cond_br %valid, ^bb1(%shfl : f32), ^bb0 ^bb0: %m1 = arith.constant -1.0 : f32 diff --git a/mlir/test/Integration/GPU/ROCM/printf.mlir b/mlir/test/Integration/GPU/ROCM/printf.mlir --- a/mlir/test/Integration/GPU/ROCM/printf.mlir +++ b/mlir/test/Integration/GPU/ROCM/printf.mlir @@ -12,7 +12,7 @@ module attributes {gpu.container_module} { gpu.module @kernels { gpu.func @hello() kernel { - %0 = "gpu.thread_id"() {dimension="x"} : () -> (index) + %0 = gpu.thread_id x gpu.printf "Hello from %d\n" %0 : index gpu.return } diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -42,21 +42,21 @@ %0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : f32) -> i32 { // CHECK: call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %6 = nvvm.shfl.sync "bfly" %0, %3, %1, %2 : i32 -> i32 + %6 = nvvm.shfl.sync bfly %0, %3, %1, %2 : i32 -> i32 // CHECK: call float @llvm.nvvm.shfl.sync.bfly.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %7 = nvvm.shfl.sync "bfly" %0, %4, %1, %2 : f32 -> f32 + %7 = nvvm.shfl.sync bfly %0, %4, %1, %2 : f32 -> f32 // CHECK: call i32 @llvm.nvvm.shfl.sync.up.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %8 = nvvm.shfl.sync "up" %0, %3, %1, %2 : i32 -> i32 + %8 = 
nvvm.shfl.sync up %0, %3, %1, %2 : i32 -> i32 // CHECK: call float @llvm.nvvm.shfl.sync.up.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %9 = nvvm.shfl.sync "up" %0, %4, %1, %2 : f32 -> f32 + %9 = nvvm.shfl.sync up %0, %4, %1, %2 : f32 -> f32 // CHECK: call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %10 = nvvm.shfl.sync "down" %0, %3, %1, %2 : i32 -> i32 + %10 = nvvm.shfl.sync down %0, %3, %1, %2 : i32 -> i32 // CHECK: call float @llvm.nvvm.shfl.sync.down.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %11 = nvvm.shfl.sync "down" %0, %4, %1, %2 : f32 -> f32 + %11 = nvvm.shfl.sync down %0, %4, %1, %2 : f32 -> f32 // CHECK: call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %12 = nvvm.shfl.sync "idx" %0, %3, %1, %2 : i32 -> i32 + %12 = nvvm.shfl.sync idx %0, %3, %1, %2 : i32 -> i32 // CHECK: call float @llvm.nvvm.shfl.sync.idx.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %13 = nvvm.shfl.sync "idx" %0, %4, %1, %2 : f32 -> f32 + %13 = nvvm.shfl.sync idx %0, %4, %1, %2 : f32 -> f32 llvm.return %6 : i32 } @@ -64,21 +64,21 @@ %0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : f32) -> !llvm.struct<(i32, i1)> { // CHECK: call { i32, i1 } @llvm.nvvm.shfl.sync.bfly.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %6 = nvvm.shfl.sync "bfly" %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> + %6 = nvvm.shfl.sync bfly %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> // CHECK: call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %7 = nvvm.shfl.sync "bfly" %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> + %7 = nvvm.shfl.sync bfly %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> // CHECK: call { i32, i1 } @llvm.nvvm.shfl.sync.up.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %8 = nvvm.shfl.sync "up" %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> + %8 = nvvm.shfl.sync up %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> // CHECK: call { float, i1 } @llvm.nvvm.shfl.sync.up.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %9 = nvvm.shfl.sync "up" %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> + %9 = nvvm.shfl.sync up %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> // CHECK: call { i32, i1 } @llvm.nvvm.shfl.sync.down.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %10 = nvvm.shfl.sync "down" %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> + %10 = nvvm.shfl.sync down %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> // CHECK: call { float, i1 } @llvm.nvvm.shfl.sync.down.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %11 = nvvm.shfl.sync "down" %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> + %11 = nvvm.shfl.sync down %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> // CHECK: call { i32, i1 } @llvm.nvvm.shfl.sync.idx.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %12 = nvvm.shfl.sync "idx" %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> + %12 = nvvm.shfl.sync idx %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)> // CHECK: call { float, i1 } @llvm.nvvm.shfl.sync.idx.f32p(i32 %{{.*}}, float 
%{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %13 = nvvm.shfl.sync "idx" %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> + %13 = nvvm.shfl.sync idx %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)> llvm.return %6 : !llvm.struct<(i32, i1)> } @@ -102,7 +102,7 @@ llvm.func @gpu_wmma_load_op(%arg0: !llvm.ptr, %arg1: i32) { // CHECK: call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p3i32(i32 addrspace(3)* %{{.*}}, i32 %{{.*}}) %0 = nvvm.wmma.load %arg0, %arg1 - {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> llvm.return @@ -115,7 +115,7 @@ %arg4: vector<2 xf16>, %arg5: vector<2 x f16>) { // CHECK: call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p3i32(i32 addrspace(3)* %{{.*}}, <2 x half> {{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, i32 %{{.*}}) nvvm.wmma.store %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 - {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16> llvm.return } @@ -134,7 +134,7 @@ %arg18: vector<2 x f16>, %arg19: vector<2 x f16>) { // CHECK: call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}) %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19 - {eltypeA = "f16", eltypeB = "f16", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} + {eltypeA = #nvvm.mma_type, eltypeB = #nvvm.mma_type, k = 16 : i32, layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, @@ -146,7 +146,7 @@ llvm.func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr, %arg1 : i32) { // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0i32(i32* %{{.*}}, i32 %{{.*}}) %0 = nvvm.wmma.load %arg0, %arg1 - {eltype = "tf32", frag = "a", k = 8 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 8 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)> llvm.return } @@ -156,7 +156,7 @@ %11 : f32, %12 : f32, %13 : f32, %14 : f32, %15 : f32) { // CHECK: { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %{{.*}}, i32 %{{.*}}, i32 
%{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) %r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15 - {eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} + {eltypeA = #nvvm.mma_type, eltypeB = #nvvm.mma_type, k = 8 : i32, layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> llvm.return diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -663,14 +663,14 @@ // CHECK: [[TMP2:%.*]] = getelementptr inbounds [1 x i64], [1 x i64]* [[ADDR]], i64 0, i64 0 // CHECK: [[OMP_THREAD2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) // CHECK: call void @__kmpc_doacross_wait(%struct.ident_t* @[[GLOB3]], i32 [[OMP_THREAD2]], i64* [[TMP2]]) - omp.ordered depend_type("dependsink") depend_vec(%arg3 : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsink) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64} // CHECK: [[TMP3:%.*]] = getelementptr inbounds [1 x i64], [1 x i64]* [[ADDR3]], i64 0, i64 0 // CHECK: store i64 [[ARG0]], i64* [[TMP3]], align 8 // CHECK: [[TMP4:%.*]] = getelementptr inbounds [1 x i64], [1 x i64]* [[ADDR3]], i64 0, i64 0 // CHECK: [[OMP_THREAD4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB5:[0-9]+]]) // CHECK: call void @__kmpc_doacross_post(%struct.ident_t* @[[GLOB5]], i32 [[OMP_THREAD4]], i64* [[TMP4]]) - omp.ordered depend_type("dependsource") depend_vec(%arg3 : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsource) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64} omp.yield } @@ -690,7 +690,7 @@ // CHECK: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[ADDR7]], i64 0, i64 0 // CHECK: [[OMP_THREAD8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]]) // CHECK: call void @__kmpc_doacross_wait(%struct.ident_t* @[[GLOB7]], i32 [[OMP_THREAD8]], i64* [[TMP10]]) - omp.ordered depend_type("dependsink") depend_vec(%arg3, %arg4, %arg5, %arg6 : i64, i64, i64, i64) {num_loops_val = 2 : i64} + omp.ordered depend_type(dependsink) depend_vec(%arg3, %arg4, %arg5, %arg6 : i64, i64, i64, i64) {num_loops_val = 2 : i64} // CHECK: [[TMP11:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[ADDR9]], i64 0, i64 0 // CHECK: store i64 [[ARG0]], i64* [[TMP11]], align 8 @@ -699,7 +699,7 @@ // CHECK: [[TMP13:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[ADDR9]], i64 0, i64 0 // CHECK: [[OMP_THREAD10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9:[0-9]+]]) // CHECK: call void @__kmpc_doacross_post(%struct.ident_t* @[[GLOB9]], i32 [[OMP_THREAD10]], i64* [[TMP13]]) - omp.ordered depend_type("dependsource") depend_vec(%arg3, %arg4 : i64, i64) {num_loops_val = 2 : i64} + omp.ordered depend_type(dependsource) depend_vec(%arg3, %arg4 : i64, i64) {num_loops_val = 2 : i64} omp.yield } diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgDistribution.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgDistribution.cpp --- a/mlir/test/lib/Dialect/Linalg/TestLinalgDistribution.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgDistribution.cpp 
@@ -19,21 +19,20 @@ using namespace mlir; using namespace mlir::linalg; -template +template static linalg::ProcInfo getGpuBlockInfo(OpBuilder &b, Location loc) { - std::string d(1, dim); - StringAttr attr = b.getStringAttr(d); - Type indexType = b.getIndexType(); - ProcInfo procInfo = {b.create(loc, indexType, attr), - b.create(loc, indexType, attr)}; + ProcInfo procInfo = {b.create(loc, indexType, Dim), + b.create(loc, indexType, Dim)}; return procInfo; } static LinalgLoopDistributionOptions getDistributionOptions() { LinalgLoopDistributionOptions opts; - opts.procInfoMap.insert(std::make_pair("block_x", getGpuBlockInfo<'x'>)); - opts.procInfoMap.insert(std::make_pair("block_y", getGpuBlockInfo<'y'>)); + opts.procInfoMap.insert( + std::make_pair("block_x", getGpuBlockInfo)); + opts.procInfoMap.insert( + std::make_pair("block_y", getGpuBlockInfo)); return opts; } diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp --- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp @@ -370,12 +370,11 @@ getGpuProcIds(OpBuilder &b, Location loc, ArrayRef parallelLoopRanges) { size_t count = std::min(3, parallelLoopRanges.size()); SmallVector procInfo(count); - const char *xyz[] = {"x", "y", "z"}; Type indexType = b.getIndexType(); for (unsigned i = 0; i < count; ++i) { - procInfo[count - 1 - i] = { - b.create(loc, indexType, b.getStringAttr(xyz[i])), - b.create(loc, indexType, b.getStringAttr(xyz[i]))}; + gpu::Dimension dim = *gpu::symbolizeDimension(i); + procInfo[count - 1 - i] = {b.create(loc, indexType, dim), + b.create(loc, indexType, dim)}; } return procInfo; } diff --git a/mlir/test/mlir-tblgen/directive-common.td b/mlir/test/mlir-tblgen/directive-common.td --- a/mlir/test/mlir-tblgen/directive-common.td +++ b/mlir/test/mlir-tblgen/directive-common.td @@ -1,4 +1,4 @@ -// RUN: mlir-tblgen -gen-directive-decl -I %S/../../../llvm/include %s | FileCheck -match-full-lines %s +// RUN: mlir-tblgen -gen-directive-decl -directives-dialect=TDL -I %S/../../../llvm/include %s | FileCheck -match-full-lines %s include "llvm/Frontend/Directive/DirectiveBase.td" @@ -21,11 +21,12 @@ ]; } -// CHECK: def AKindvala : StrEnumAttrCase<"vala">; -// CHECK: def AKindvalb : StrEnumAttrCase<"valb">; -// CHECK: def AKind: StrEnumAttr< +// CHECK: def AKindvala : I32EnumAttrCase<"vala", 0>; +// CHECK: def AKindvalb : I32EnumAttrCase<"valb", 1>; +// CHECK: def AKind: I32EnumAttr< // CHECK: "ClauseAKind", // CHECK: "AKind Clause", // CHECK: [AKindvala,AKindvalb]> { // CHECK: let cppNamespace = "::mlir::tdl"; // CHECK: } +// CHECK: def AKindAttr : EnumAttr; diff --git a/mlir/test/mlir-vulkan-runner/addf.mlir b/mlir/test/mlir-vulkan-runner/addf.mlir --- a/mlir/test/mlir-vulkan-runner/addf.mlir +++ b/mlir/test/mlir-vulkan-runner/addf.mlir @@ -9,7 +9,7 @@ gpu.module @kernels { gpu.func @kernel_add(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>) kernel attributes { spv.entry_point_abi = {local_size = dense<[1, 1, 1]>: vector<3xi32> }} { - %0 = "gpu.block_id"() {dimension = "x"} : () -> index + %0 = gpu.block_id x %1 = memref.load %arg0[%0] : memref<8xf32> %2 = memref.load %arg1[%0] : memref<8xf32> %3 = arith.addf %1, %2 : f32 diff --git a/mlir/test/mlir-vulkan-runner/addi.mlir b/mlir/test/mlir-vulkan-runner/addi.mlir --- a/mlir/test/mlir-vulkan-runner/addi.mlir +++ b/mlir/test/mlir-vulkan-runner/addi.mlir @@ -9,9 +9,9 @@ gpu.module @kernels { gpu.func @kernel_addi(%arg0 : 
memref<8xi32>, %arg1 : memref<8x8xi32>, %arg2 : memref<8x8x8xi32>) kernel attributes { spv.entry_point_abi = {local_size = dense<[1, 1, 1]>: vector<3xi32>}} { - %x = "gpu.block_id"() {dimension = "x"} : () -> index - %y = "gpu.block_id"() {dimension = "y"} : () -> index - %z = "gpu.block_id"() {dimension = "z"} : () -> index + %x = gpu.block_id x + %y = gpu.block_id y + %z = gpu.block_id z %0 = memref.load %arg0[%x] : memref<8xi32> %1 = memref.load %arg1[%y, %x] : memref<8x8xi32> %2 = arith.addi %0, %1 : i32 diff --git a/mlir/test/mlir-vulkan-runner/addi8.mlir b/mlir/test/mlir-vulkan-runner/addi8.mlir --- a/mlir/test/mlir-vulkan-runner/addi8.mlir +++ b/mlir/test/mlir-vulkan-runner/addi8.mlir @@ -9,9 +9,9 @@ gpu.module @kernels { gpu.func @kernel_addi(%arg0 : memref<8xi8>, %arg1 : memref<8x8xi8>, %arg2 : memref<8x8x8xi32>) kernel attributes { spv.entry_point_abi = {local_size = dense<[1, 1, 1]>: vector<3xi32>}} { - %x = "gpu.block_id"() {dimension = "x"} : () -> index - %y = "gpu.block_id"() {dimension = "y"} : () -> index - %z = "gpu.block_id"() {dimension = "z"} : () -> index + %x = gpu.block_id x + %y = gpu.block_id y + %z = gpu.block_id z %0 = memref.load %arg0[%x] : memref<8xi8> %1 = memref.load %arg1[%y, %x] : memref<8x8xi8> %2 = arith.addi %0, %1 : i8 diff --git a/mlir/test/mlir-vulkan-runner/mulf.mlir b/mlir/test/mlir-vulkan-runner/mulf.mlir --- a/mlir/test/mlir-vulkan-runner/mulf.mlir +++ b/mlir/test/mlir-vulkan-runner/mulf.mlir @@ -9,8 +9,8 @@ gpu.module @kernels { gpu.func @kernel_mul(%arg0 : memref<4x4xf32>, %arg1 : memref<4x4xf32>, %arg2 : memref<4x4xf32>) kernel attributes { spv.entry_point_abi = {local_size = dense<[1, 1, 1]>: vector<3xi32> }} { - %x = "gpu.block_id"() {dimension = "x"} : () -> index - %y = "gpu.block_id"() {dimension = "y"} : () -> index + %x = gpu.block_id x + %y = gpu.block_id y %1 = memref.load %arg0[%x, %y] : memref<4x4xf32> %2 = memref.load %arg1[%x, %y] : memref<4x4xf32> %3 = arith.mulf %1, %2 : f32 diff --git a/mlir/test/mlir-vulkan-runner/subf.mlir b/mlir/test/mlir-vulkan-runner/subf.mlir --- a/mlir/test/mlir-vulkan-runner/subf.mlir +++ b/mlir/test/mlir-vulkan-runner/subf.mlir @@ -9,9 +9,9 @@ gpu.module @kernels { gpu.func @kernel_sub(%arg0 : memref<8x4x4xf32>, %arg1 : memref<4x4xf32>, %arg2 : memref<8x4x4xf32>) kernel attributes { spv.entry_point_abi = {local_size = dense<[1, 1, 1]>: vector<3xi32> }} { - %x = "gpu.block_id"() {dimension = "x"} : () -> index - %y = "gpu.block_id"() {dimension = "y"} : () -> index - %z = "gpu.block_id"() {dimension = "z"} : () -> index + %x = gpu.block_id x + %y = gpu.block_id y + %z = gpu.block_id z %1 = memref.load %arg0[%x, %y, %z] : memref<8x4x4xf32> %2 = memref.load %arg1[%y, %z] : memref<4x4xf32> %3 = arith.subf %1, %2 : f32 diff --git a/mlir/test/mlir-vulkan-runner/time.mlir b/mlir/test/mlir-vulkan-runner/time.mlir --- a/mlir/test/mlir-vulkan-runner/time.mlir +++ b/mlir/test/mlir-vulkan-runner/time.mlir @@ -12,8 +12,8 @@ gpu.module @kernels { gpu.func @kernel_add(%arg0 : memref<16384xf32>, %arg1 : memref<16384xf32>, %arg2 : memref<16384xf32>) kernel attributes { spv.entry_point_abi = {local_size = dense<[128, 1, 1]>: vector<3xi32> }} { - %bid = "gpu.block_id"() {dimension = "x"} : () -> index - %tid = "gpu.thread_id"() {dimension = "x"} : () -> index + %bid = gpu.block_id x + %tid = gpu.thread_id x %cst = arith.constant 128 : index %b = arith.muli %bid, %cst : index %0 = arith.addi %b, %tid : index diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp --- 
a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -731,7 +731,7 @@ if (parseResult.hasValue()) return attr; } - parser.emitError(typeLoc) << "unknown attribute `" + parser.emitError(typeLoc) << "unknown attribute `" << attrTag << "` in dialect `" << getNamespace() << "`"; return {}; } diff --git a/mlir/tools/mlir-tblgen/DirectiveCommonGen.cpp b/mlir/tools/mlir-tblgen/DirectiveCommonGen.cpp --- a/mlir/tools/mlir-tblgen/DirectiveCommonGen.cpp +++ b/mlir/tools/mlir-tblgen/DirectiveCommonGen.cpp @@ -14,8 +14,10 @@ #include "mlir/TableGen/GenInfo.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/DirectiveEmitter.h" +#include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" using llvm::Clause; @@ -39,7 +41,14 @@ // Clause record in OMP.td. This name can be used to specify the type of the // OpenMP operation's operand. The allowedClauseValues field provides the list // of ClauseValues which are part of the enumeration. -static bool emitDecls(const RecordKeeper &recordKeeper, raw_ostream &os) { +static bool emitDecls(const RecordKeeper &recordKeeper, llvm::StringRef dialect, + raw_ostream &os) { + // A dialect must be selected for the generated attributes. + if (dialect.empty()) { + llvm::PrintFatalError("a dialect must be selected for the directives via " + "'--directives-dialect'"); + } + const auto &directiveLanguages = recordKeeper.getAllDerivedDefinitions("DirectiveLanguage"); assert(!directiveLanguages.empty() && "DirectiveLanguage missing."); @@ -56,18 +65,19 @@ assert(!enumName.empty() && "enumClauseValue field not set."); std::vector<std::string> cvDefs; - for (const auto &cv : clauseVals) { - ClauseVal cval{cv}; + for (const auto &it : llvm::enumerate(clauseVals)) { + ClauseVal cval{it.value()}; if (!cval.isUserVisible()) continue; const auto name = cval.getFormattedName(); std::string cvDef{(enumName + llvm::Twine(name)).str()}; - os << "def " << cvDef << " : StrEnumAttrCase<\"" << name << "\">;\n"; + os << "def " << cvDef << " : I32EnumAttrCase<\"" << name << "\", " + << it.index() << ">;\n"; cvDefs.push_back(cvDef); } - os << "def " << enumName << ": StrEnumAttr<\n"; + os << "def " << enumName << ": I32EnumAttr<\n"; os << " \"Clause" << enumName << "\",\n"; os << " \"" << enumName << " Clause\",\n"; os << " ["; @@ -79,15 +89,27 @@ os << "]> {\n"; os << " let cppNamespace = \"::mlir::" << directiveLanguages[0]->getValueAsString("cppNamespace") << "\";\n"; + os << " let genSpecializedAttr = 0;\n"; os << "}\n"; + llvm::SmallString<16> mnemonic; + llvm::transform(enumName, std::back_inserter(mnemonic), llvm::toLower); + os << "def " << enumName << "Attr : EnumAttr<" << dialect << "_Dialect, " + << enumName << ", \"" << mnemonic << "\">;\n"; } return false; } +static llvm::cl::OptionCategory + directiveGenCat("Options for gen-directive-decl"); +static llvm::cl::opt<std::string> + dialect("directives-dialect", + llvm::cl::desc("Generate directives for this dialect"), + llvm::cl::cat(directiveGenCat), llvm::cl::CommaSeparated); + // Registers the generator to mlir-tblgen.
static mlir::GenRegistration genDirectiveDecls( "gen-directive-decl", "Generate declarations for directives (OpenMP/OpenACC etc.)", [](const RecordKeeper &records, raw_ostream &os) { - return emitDecls(records, os); + return emitDecls(records, dialect, os); }); diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -424,6 +424,14 @@ Optional variableTransformer; }; + /// The context in which an element is generated. + enum class GenContext { + /// The element is generated at the top-level or with the same behaviour. + Normal, + /// The element is generated inside an optional group. + Optional + }; + OperationFormat(const Operator &op) : allOperands(false), allOperandTypes(false), allResultTypes(false), infersResultTypes(false) { @@ -442,7 +450,8 @@ void genParser(Operator &op, OpClass &opClass); /// Generate the parser code for a specific format element. void genElementParser(Element *element, MethodBody &body, - FmtContext &attrTypeCtx); + FmtContext &attrTypeCtx, + GenContext genCtx = GenContext::Normal); /// Generate the C++ to resolve the types of operands and results during /// parsing. void genParserTypeResolution(Operator &op, MethodBody &body); @@ -1217,7 +1226,8 @@ } void OperationFormat::genElementParser(Element *element, MethodBody &body, - FmtContext &attrTypeCtx) { + FmtContext &attrTypeCtx, + GenContext genCtx) { /// Optional Group. if (auto *optional = dyn_cast(element)) { auto elements = llvm::drop_begin(optional->getThenElements(), @@ -1264,10 +1274,13 @@ << "\", parser.getBuilder().getUnitAttr());\n"; } - // Generate the rest of the elements normally. + // Generate the rest of the elements inside an optional group. Elements in + // an optional group after the guard are parsed as required. 
for (Element &childElement : llvm::drop_begin(elements, 1)) { - if (&childElement != elidedAnchorElement) - genElementParser(&childElement, body, attrTypeCtx); + if (&childElement != elidedAnchorElement) { + genElementParser(&childElement, body, attrTypeCtx, + GenContext::Optional); + } } body << " }"; @@ -1316,7 +1329,7 @@ } else { attrTypeStr = "::mlir::Type{}"; } - if (var->attr.isOptional()) { + if (genCtx == GenContext::Normal && var->attr.isOptional()) { body << formatv(optionalAttrParserCode, var->name, attrTypeStr); } else { if (attr->shouldBeQualified() || @@ -2048,9 +2061,6 @@ if (attr->getTypeBuilder()) body << " _odsPrinter.printAttributeWithoutType(" << op.getGetterName(var->name) << "Attr());\n"; - else if (var->attr.isOptional()) - body << "_odsPrinter.printAttribute(" << op.getGetterName(var->name) - << "Attr());\n"; else if (attr->shouldBeQualified() || var->attr.getStorageType() == "::mlir::Attribute") body << " _odsPrinter.printAttribute(" << op.getGetterName(var->name) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3002,6 +3002,14 @@ ["-gen-enum-defs"], "include/mlir/Dialect/GPU/GPUOpsEnums.cpp.inc", ), + ( + ["-gen-attrdef-decls"], + "include/mlir/Dialect/GPU/GPUOpsAttributes.h.inc", + ), + ( + ["-gen-attrdef-defs"], + "include/mlir/Dialect/GPU/GPUOpsAttributes.cpp.inc", + ), ], tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/GPU/GPUOps.td", @@ -3541,6 +3549,20 @@ ["-gen-enum-defs"], "include/mlir/Dialect/LLVMIR/NVVMOpsEnums.cpp.inc", ), + ( + [ + "-gen-attrdef-decls", + "-attrdefs-dialect=nvvm", + ], + "include/mlir/Dialect/LLVMIR/NVVMOpsAttributes.h.inc", + ), + ( + [ + "-gen-attrdef-defs", + "-attrdefs-dialect=nvvm", + ], + "include/mlir/Dialect/LLVMIR/NVVMOpsAttributes.cpp.inc", + ), ], tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/NVVMOps.td", @@ -5884,7 +5906,10 @@ strip_include_prefix = "include", tbl_outs = [ ( - ["-gen-directive-decl"], + [ + "-gen-directive-decl", + "-directives-dialect=OpenACC", + ], "include/mlir/Dialect/OpenACC/AccCommon.td", ), ], @@ -5937,6 +5962,20 @@ ["-gen-enum-defs"], "include/mlir/Dialect/OpenACC/OpenACCOpsEnums.cpp.inc", ), + ( + [ + "-gen-attrdef-decls", + "-attrdefs-dialect=acc", + ], + "include/mlir/Dialect/OpenACC/OpenACCOpsAttributes.h.inc", + ), + ( + [ + "-gen-attrdef-defs", + "-attrdefs-dialect=acc", + ], + "include/mlir/Dialect/OpenACC/OpenACCOpsAttributes.cpp.inc", + ), ( ["-gen-op-doc"], "g3doc/Dialects/OpenACC/OpenACCOps.md", @@ -5978,7 +6017,10 @@ strip_include_prefix = "include", tbl_outs = [ ( - ["-gen-directive-decl"], + [ + "-gen-directive-decl", + "-directives-dialect=OpenMP", + ], "include/mlir/Dialect/OpenMP/OmpCommon.td", ), ], @@ -6045,6 +6087,20 @@ ["-gen-type-interface-defs"], "include/mlir/Dialect/OpenMP/OpenMPTypeInterfaces.cpp.inc", ), + ( + [ + "-gen-attrdef-decls", + "-attrdefs-dialect=omp", + ], + "include/mlir/Dialect/OpenMP/OpenMPOpsAttributes.h.inc", + ), + ( + [ + "-gen-attrdef-defs", + "-attrdefs-dialect=omp", + ], + "include/mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc", + ), ( ["-gen-op-doc"], "g3doc/Dialects/OpenMP/OpenMPOps.md",
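
Note for reviewers: the test churn above is mechanical. Below is a condensed before/after of the assembly syntax this change moves to, with forms lifted verbatim from the updated tests; it is illustrative only and not an exhaustive list of the affected dialects or operations.

```mlir
// GPU dialect: string attributes become typed enum attributes with keyword syntax.
//   before: %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
//   after:  %tIdX = gpu.thread_id x
//   before: %sum = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32)
//   after:  %sum = gpu.all_reduce add %val {} : (f32) -> (f32)

// OpenMP dialect: quoted clause values become bare enum keywords.
//   before: omp.ordered depend_type("dependsink") depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
//   after:  omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}

// NVVM dialect: the shuffle kind is likewise a keyword rather than a string.
//   before: %6 = nvvm.shfl.sync "bfly" %0, %3, %1, %2 : i32 -> i32
//   after:  %6 = nvvm.shfl.sync bfly %0, %3, %1, %2 : i32 -> i32
```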