diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
--- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -152,7 +152,7 @@
   }
   // gfx9 has no to a very limited support for floating-point min and max.
   if (chipset.majorVersion == 9) {
-    if (chipset.minorVersion >= 0x0a) {
+    if (chipset.minorVersion >= 0x0a && chipset.minorVersion != 0x41) {
       // gfx90a supports f64 max (and min, but we don't have a min wrapper right
       // now) but all other types need to be emulated.
       target.addDynamicallyLegalOp<RawBufferAtomicFmaxOp>(
@@ -162,10 +162,18 @@
     } else {
       target.addIllegalOp<RawBufferAtomicFmaxOp>();
     }
+    if (chipset.minorVersion == 0x41) {
+      // gfx941 requires non-CAS atomics to be implemented with CAS loops.
+      // The workaround here mirrors HIP and OpenMP.
+      target.addIllegalOp<RawBufferAtomicFaddOp, RawBufferAtomicFmaxOp,
+                          RawBufferAtomicSmaxOp, RawBufferAtomicUminOp>();
+    }
   }
   patterns
       .add<RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>,
-           RawBufferAtomicByCasPattern<RawBufferAtomicFmaxOp, arith::MaxFOp>>(
+           RawBufferAtomicByCasPattern<RawBufferAtomicFmaxOp, arith::MaxFOp>,
+           RawBufferAtomicByCasPattern<RawBufferAtomicSmaxOp, arith::MaxSIOp>,
+           RawBufferAtomicByCasPattern<RawBufferAtomicUminOp, arith::MinUIOp>>(
           patterns.getContext());
 }