diff --git a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir @@ -25,9 +25,9 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,6 @@ return } -func @mcuMemHostRegisterInt32(%ptr : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir @@ -25,9 +25,9 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,6 @@ return } -func @mcuMemHostRegisterInt32(%ptr : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir --- 
a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir @@ -25,9 +25,9 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,6 @@ return } -func @mcuMemHostRegisterInt32(%ptr : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir @@ -11,7 +11,7 @@ %sy = dim %dst, %c1 : memref<?x?x?xf32> %sz = dim %dst, %c0 : memref<?x?x?xf32> %cast_dst = memref_cast %dst : memref<?x?x?xf32> to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_dst) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_dst) : (memref<*xf32>) -> () gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %sy, %block_z = %sz) { %t0 = muli %tz, %block_y : index @@ -28,5 +28,5 @@ return } -func @mcuMemHostRegisterFloat(%ptr : memref<*xf32>) +func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir @@ -25,9 +25,9 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call
@mcuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,6 @@ return } -func @mcuMemHostRegisterInt32(%ptr : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir @@ -8,7 +8,7 @@ %c0 = constant 0 : index %sx = dim %dst, %c0 : memref<?xf32> %cast_dst = memref_cast %dst : memref<?xf32> to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_dst) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_dst) : (memref<*xf32>) -> () gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) { %val = index_cast %tx : index to i32 @@ -25,5 +25,5 @@ return } -func @mcuMemHostRegisterFloat(%ptr : memref<*xf32>) +func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir @@ -25,9 +25,9 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call
@mcuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,6 @@ return } -func @mcuMemHostRegisterInt32(%ptr : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir --- a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir +++ b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir @@ -18,7 +18,7 @@ %21 = constant 5 : i32 %22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32> %23 = memref_cast %22 : memref<?xf32> to memref<*xf32> - call @mcuMemHostRegisterFloat(%23) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%23) : (memref<*xf32>) -> () call @print_memref_f32(%23) : (memref<*xf32>) -> () %24 = constant 1.0 : f32 call @other_func(%24, %22) : (f32, memref<?xf32>) -> () @@ -26,5 +26,5 @@ return } -func @mcuMemHostRegisterFloat(%ptr : memref<*xf32>) +func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir --- a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir +++ b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir @@ -26,11 +26,11 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xf32> to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_data) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_data) : (memref<*xf32>) -> () %cast_sum = memref_cast %sum : memref<2xf32> to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_sum) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_sum) : (memref<*xf32>) -> () %cast_mul = memref_cast %mul : memref<2xf32> to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_mul) : (memref<*xf32>) -> () + call
@mgpuMemHostRegisterFloat(%cast_mul) : (memref<*xf32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xf32> store %cst1, %data[%c0, %c1] : memref<2x6xf32> @@ -66,5 +66,5 @@ return } -func @mcuMemHostRegisterFloat(%ptr : memref<*xf32>) +func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/shuffle.mlir b/mlir/test/mlir-cuda-runner/shuffle.mlir --- a/mlir/test/mlir-cuda-runner/shuffle.mlir +++ b/mlir/test/mlir-cuda-runner/shuffle.mlir @@ -8,7 +8,7 @@ %c0 = constant 0 : index %sx = dim %dst, %c0 : memref<?xf32> %cast_dest = memref_cast %dst : memref<?xf32> to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_dest) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_dest) : (memref<*xf32>) -> () gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) { %t0 = index_cast %tx : index to i32 @@ -28,5 +28,5 @@ return } -func @mcuMemHostRegisterFloat(%ptr : memref<*xf32>) +func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/two-modules.mlir b/mlir/test/mlir-cuda-runner/two-modules.mlir --- a/mlir/test/mlir-cuda-runner/two-modules.mlir +++ b/mlir/test/mlir-cuda-runner/two-modules.mlir @@ -8,7 +8,7 @@ %c0 = constant 0 : index %sx = dim %dst, %c0 : memref<?xi32> %cast_dst = memref_cast %dst : memref<?xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_dst) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_dst) : (memref<*xi32>) -> () gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) { %t0 = index_cast %tx : index to i32 @@ -25,5 +25,5 @@ return } -func @mcuMemHostRegisterInt32(%memref : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%memref : memref<*xi32>) func @print_memref_i32(%memref :
memref<*xi32>) diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp --- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp +++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp @@ -83,7 +83,7 @@ // Allows to register a MemRef with the CUDA runtime. Initializes array with // value. Helpful until we have transfer functions implemented. template <typename T> -void mcuMemHostRegisterMemRef(const DynamicMemRefType<T> &mem_ref, T value) { +void mgpuMemHostRegisterMemRef(const DynamicMemRefType<T> &mem_ref, T value) { llvm::SmallVector<int64_t, 4> denseStrides(mem_ref.rank); llvm::ArrayRef<int64_t> sizes(mem_ref.sizes, mem_ref.rank); llvm::ArrayRef<int64_t> strides(mem_ref.strides, mem_ref.rank); @@ -103,12 +103,12 @@ mgpuMemHostRegister(pointer, count * sizeof(T)); } -extern "C" void mcuMemHostRegisterFloat(int64_t rank, void *ptr) { +extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) { UnrankedMemRefType<float> mem_ref = {rank, ptr}; - mcuMemHostRegisterMemRef(DynamicMemRefType<float>(mem_ref), 1.23f); + mgpuMemHostRegisterMemRef(DynamicMemRefType<float>(mem_ref), 1.23f); } -extern "C" void mcuMemHostRegisterInt32(int64_t rank, void *ptr) { +extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) { UnrankedMemRefType<int32_t> mem_ref = {rank, ptr}; - mcuMemHostRegisterMemRef(DynamicMemRefType<int32_t>(mem_ref), 123); + mgpuMemHostRegisterMemRef(DynamicMemRefType<int32_t>(mem_ref), 123); }