diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -55,7 +55,7 @@
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
              threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
     %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
-    %reduced = gpu.all_reduce and %val {} : (i32) -> (i32)
+    %reduced = gpu.all_reduce and %val uniform {} : (i32) -> (i32)
     memref.store %reduced, %sum[%bx] : memref<2xi32>
     gpu.terminator
   }
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -55,7 +55,7 @@
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
              threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
     %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
-    %reduced = gpu.all_reduce max %val {} : (i32) -> (i32)
+    %reduced = gpu.all_reduce max %val uniform {} : (i32) -> (i32)
     memref.store %reduced, %sum[%bx] : memref<2xi32>
     gpu.terminator
   }
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -55,7 +55,7 @@
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
              threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
     %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
-    %reduced = gpu.all_reduce min %val {} : (i32) -> (i32)
+    %reduced = gpu.all_reduce min %val uniform {} : (i32) -> (i32)
     memref.store %reduced, %sum[%bx] : memref<2xi32>
     gpu.terminator
   }
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -28,7 +28,7 @@
     %idx = arith.addi %tx, %t2 : index
     %t3 = arith.index_cast %idx : index to i32
     %val = arith.sitofp %t3 : i32 to f32
-    %sum = gpu.all_reduce add %val {} : (f32) -> (f32)
+    %sum = gpu.all_reduce add %val uniform {} : (f32) -> (f32)
     memref.store %sum, %dst[%tz, %ty, %tx] : memref<?x?x?xf32>
     gpu.terminator
   }
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -55,7 +55,7 @@
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
     %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
-    %reduced = gpu.all_reduce or %val {} : (i32) -> (i32)
+    %reduced = gpu.all_reduce or %val uniform {} : (i32) -> (i32)
     memref.store %reduced, %sum[%bx] : memref<2xi32>
     gpu.terminator
   }
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -20,7 +20,7 @@
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
     %val = arith.index_cast %tx : index to i32
-    %xor = gpu.all_reduce %val {
+    %xor = gpu.all_reduce %val uniform {
     ^bb(%lhs : i32, %rhs : i32):
       %xor = arith.xori %lhs, %rhs : i32
       "gpu.yield"(%xor) : (i32) -> ()
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -55,7 +55,7 @@
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
     %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
-    %reduced = gpu.all_reduce xor %val {} : (i32) -> (i32)
+    %reduced = gpu.all_reduce xor %val uniform {} : (i32) -> (i32)
     memref.store %reduced, %sum[%bx] : memref<2xi32>
     gpu.terminator
   }
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -58,9 +58,9 @@
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
     %val = memref.load %data[%bx, %tx] : memref<2x6xf32>
-    %reduced0 = gpu.all_reduce add %val {} : (f32) -> (f32)
+    %reduced0 = gpu.all_reduce add %val uniform {} : (f32) -> (f32)
     memref.store %reduced0, %sum[%bx] : memref<2xf32>
-    %reduced1 = gpu.all_reduce mul %val {} : (f32) -> (f32)
+    %reduced1 = gpu.all_reduce mul %val uniform {} : (f32) -> (f32)
     memref.store %reduced1, %mul[%bx] : memref<2xf32>
     gpu.terminator
   }