diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -1253,6 +1253,139 @@
   }];
 }
 
+// GCN raw buffer load
+def GPU_GCNRawBufferLoadOp :
+    GPU_Op<"gcn_raw_buffer_load", [AllElementTypesMatch<["value", "memref"]>,
+      AttrSizedOperandSegments]>,
+    Arguments<(ins Arg<AnyMemRef, "buffer to load from", [MemRead]>:$memref,
+                   BoolAttr:$targetIsRDNA,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)>,
+    Results<(outs AnyTypeOf<[BF16, F16, F32, I32, I8,
+                             VectorOfLengthAndType<[2, 4], [F32, I32]>,
+                             VectorOfLengthAndType<[2, 4, 8], [F16, BF16]>,
+                             VectorOfLengthAndType<[2, 4, 8, 16], [I8]>]>:$value)> {
+
+  let summary = "AMD GPU Raw Buffer load, exposing GCN features";
+  let description = [{
+    The `gpu.gcn_raw_buffer_load` op is a wrapper around the buffer load intrinsics
+    available on AMD GPUs, including extensions in newer GPUs.
+
+    The index into the buffer is computed as for `memref.load` with the addition
+    of `indexOffset` and `sgprOffset` (which is added after bounds checks and
+    includes any offset present on the memref type if it's non-zero).
+
+    All indices and offsets are in units of the memref's data type and are
+    converted to bytes during lowering.
+
+    When a load is out of bounds, the instruction returns zero.
+    Vector instructions bounds-check each component's address.
+
+    The memref struct is converted into a buffer resource (a V#) and the arguments
+    are translated to intrinsic arguments as follows:
+    - The base address of the buffer is the base address of the memref
+    - The stride is 0 to enable raw mode
+    - The number of records is the size of the memref, in bytes
+      In the case of dynamically-shaped memrefs, this is computed at runtime
+      as max_d (size(d) * stride(d)) * sizeof(elementType(memref))
+    - The offset enable bit is 1, the index enable bit is 0.
+    - The thread ID addition bit is off
+    - If `boundsCheck` is false and the target is RDNA, OOB_SELECT is set to 2
+      to disable bounds checks; on RDNA with bounds checks enabled it is 1, and
+      on CDNA targets it is left at 0
+    - The cache coherency bits are off
+    - `targetIsRDNA` controls the setting of some reserved values that differ
+      between RDNA and CDNA cores
+  }];
+  let assemblyFormat = [{
+    attr-dict $memref `[` $indices `]`
+      (`sgprOffset` $sgprOffset^)? `:`
+      type($memref) `,` type($indices) `->` type($value)
+  }];
+  let hasVerifier = 1;
+}
+
+// GCN raw buffer store
+def GPU_GCNRawBufferStoreOp :
+    GPU_Op<"gcn_raw_buffer_store", [AllElementTypesMatch<["value", "memref"]>,
+      AttrSizedOperandSegments]>,
+    Arguments<(ins AnyTypeOf<[BF16, F16, F32, I32, I8,
+                              VectorOfLengthAndType<[2, 4], [F32, I32]>,
+                              VectorOfLengthAndType<[2, 4, 8], [F16, BF16]>,
+                              VectorOfLengthAndType<[2, 4, 8, 16], [I8]>]>:$value,
+                   Arg<AnyMemRef, "buffer to store to", [MemWrite]>:$memref,
+                   BoolAttr:$targetIsRDNA,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)> {
+
+  let summary = "AMD GPU Raw Buffer Store, exposing GCN features";
+  let description = [{
+    The `gpu.gcn_raw_buffer_store` op is a wrapper around the buffer store intrinsics
+    available on AMD GPUs, including extensions in newer GPUs.
+
+    The store index is computed as in `memref.store` with the addition of
+    `indexOffset` (which is included for uniformity with atomics and may be useful
+    when writing vectorized code) and `sgprOffset` (which is added after bounds checks
+    and implicitly includes the offset of the memref type if non-zero).
+    All index components are in terms of the elements of the memref, not bytes,
+    and are scaled up appropriately.
+
+    Out of bounds stores are ignored in hardware, including the out of bounds
+    components of vector writes.
+
+    See `gpu.gcn_raw_buffer_load` for a description of how the underlying
+    instruction is constructed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $value `->` $memref `[` $indices `]`
+      (`sgprOffset` $sgprOffset^)? `:`
+      type($value) `->` type($memref) `,` type($indices)
+  }];
+  let hasVerifier = 1;
+}
+
+// GCN raw buffer atomic floating point add
+def GPU_GCNRawBufferAtomicFaddOp :
+    GPU_Op<"gcn_raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
+      AttrSizedOperandSegments]>,
+    Arguments<(ins F32:$value,
+                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
+                   BoolAttr:$targetIsRDNA,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)> {
+
+  let summary = "AMD GPU Raw Buffer Floating-point Atomic Add (MI-* only)";
+  let description = [{
+    The `gpu.gcn_raw_buffer_atomic_fadd` op is a wrapper around the
+    buffer-based atomic floating point addition available on the MI-* series
+    of AMD GPUs.
+
+    The index into the buffer is computed as for `memref.store` with the addition
+    of `indexOffset` (which is used to aid in emitting vectorized code) and,
+    if present, `sgprOffset` (which is added after bounds checks and includes
+    any non-zero offset on the memref type).
+
+    All indexing components are given in terms of the memref's element size, not
+    the byte lengths required by the intrinsic.
+
+    Out of bounds atomic operations are ignored in hardware.
+
+    See `gpu.gcn_raw_buffer_load` for a description of how the underlying
+    instruction is constructed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $value `->` $memref `[` $indices `]`
+      (`sgprOffset` $sgprOffset^)?
`:` + type($value) `->` type($memref) `,` type($indices) + }]; + let hasVerifier = 1; +} + def GPU_DeviceAsyncCopyOp : GPU_Op<"device_async_copy", [AttrSizedOperandSegments]> { let summary = "device-side asynchronous copy"; diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -179,4 +179,57 @@ let hasCustomAssemblyFormat = 1; } +//===---------------------------------------------------------------------===// +// Raw buffer load/store intrinsics + +def ROCDL_RawBufferLoadOp : + ROCDL_Op<"raw.buffer.load">, + Results<(outs LLVM_Type:$res)>, + Arguments<(ins LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)> { + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_load, {$rsrc, $offset, + $soffset, $aux}, {$_resultType}); + }]; + let hasCustomAssemblyFormat = 1; +} + +def ROCDL_RawBufferStoreOp : + ROCDL_Op<"raw.buffer.store">, + Arguments<(ins LLVM_Type:$vdata, + LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)>{ + string llvmBuilder = [{ + auto vdataType = moduleTranslation.convertType(op.vdata().getType()); + createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_store, {$vdata, $rsrc, + $offset, $soffset, $aux}, {vdataType}); + }]; + let hasCustomAssemblyFormat = 1; +} + +//===---------------------------------------------------------------------===// +// MI-100 and MI-200 buffer atomic floating point add intrinsic + +def ROCDL_RawBufferAtomicFAddOp : + ROCDL_Op<"raw.buffer.atomic.fadd">, + Arguments<(ins LLVM_Type:$vdata, + LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)>{ + string llvmBuilder = [{ + auto vdataType = moduleTranslation.convertType(op.vdata().getType()); + createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_atomic_fadd, {$vdata, $rsrc, + $offset, $soffset, $aux}, {vdataType}); + }]; + let hasCustomAssemblyFormat = 1; +} + #endif // ROCDLIR_OPS diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -18,6 +18,7 @@ #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" @@ -91,6 +92,223 @@ } }; +// Define lowering patterns for structured buffer ops +template +struct GCNRawBufferOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + static constexpr uint32_t maxVectorOpWidth = 128; + + Value createI32Constant(ConversionPatternRewriter &rewriter, Location loc, + int32_t value) const { + IntegerAttr valAttr = rewriter.getI32IntegerAttr(value); + Type llvmI32 = this->typeConverter->convertType(rewriter.getI32Type()); + return rewriter.create(loc, llvmI32, valAttr); + } + + LogicalResult + matchAndRewrite(GpuOp gpuOp, typename GpuOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = gpuOp.getLoc(); + Value memref = adaptor.memref(); + Value unconvertedMemref 
= gpuOp.memref(); + MemRefType memrefType = unconvertedMemref.getType().cast(); + + Value storeData = adaptor.getODSOperands(0)[0]; + if (storeData == memref) // no write component to this op + storeData = Value(); + Type wantedDataType; + if (storeData) + wantedDataType = storeData.getType(); + else + wantedDataType = gpuOp.getODSResults(0)[0].getType(); + + Type llvmWantedDataType = this->typeConverter->convertType(wantedDataType); + + Type i32 = rewriter.getI32Type(); + Type llvmI32 = this->typeConverter->convertType(i32); + + int64_t elementByteWidth = memrefType.getElementTypeBitWidth() / 8; + Value byteWidthConst = createI32Constant(rewriter, loc, elementByteWidth); + + // If we want to load a vector with total size <= 32 + // bits, use a scalar load and bitcast it. Similarly, if bitsize(T) < 32 + // and the + Type llvmBufferValType = llvmWantedDataType; + if (auto dataVector = wantedDataType.dyn_cast()) { + uint32_t elemBits = dataVector.getElementTypeBitWidth(); + uint32_t totalBits = elemBits * dataVector.getNumElements(); + if (totalBits > maxVectorOpWidth) + return gpuOp.emitOpError( + "Total width of loads or stores must be no more than " + + Twine(maxVectorOpWidth) + " bits, but we call for " + + Twine(totalBits) + + " bits. This should've been caught in validation"); + if (elemBits < 32) { + if (totalBits > 32) { + if (totalBits % 32 != 0) + return gpuOp.emitOpError("Load or store of more than 32-bits that " + "doesn't fit into words. Can't happen\n"); + llvmBufferValType = this->typeConverter->convertType( + VectorType::get(totalBits / 32, i32)); + } else { + llvmBufferValType = this->typeConverter->convertType( + rewriter.getIntegerType(totalBits)); + } + } + } + + SmallVector args; + if (storeData) { + if (llvmBufferValType != llvmWantedDataType) { + Value castForStore = + rewriter.create(loc, llvmBufferValType, storeData); + args.push_back(castForStore); + } else { + args.push_back(storeData); + } + } + + // Construct buffer descriptor from memref, attributes + int64_t offset = 0; + SmallVector strides; + if (failed(getStridesAndOffset(memrefType, strides, offset))) + return gpuOp.emitOpError("Can't lower non-stride-offset memrefs"); + + // Resource descriptor + // bits 0-47: base address + // bits 48-61: stride (0 for raw buffers) + // bit 62: texture cache coherency (always 0) + // bit 63: enable swizzles (always off for raw buffers) + // bits 64-95 (word 2): Number of records, units of stride + // bits 96-127 (word 3): See below + + Type llvm4xI32 = this->typeConverter->convertType(VectorType::get(4, i32)); + MemRefDescriptor memrefDescriptor(memref); + Type llvmI64 = this->typeConverter->convertType(rewriter.getI64Type()); + Type llvm2xI32 = this->typeConverter->convertType(VectorType::get(2, i32)); + + Value resource = rewriter.create(loc, llvm4xI32); + + Value ptr = memrefDescriptor.alignedPtr(rewriter, loc); + Value ptrAsInt = rewriter.create(loc, llvmI64, ptr); + Value ptrAsInts = + rewriter.create(loc, llvm2xI32, ptrAsInt); + for (int64_t i = 0; i < 2; ++i) { + Value idxConst = this->createIndexConstant(rewriter, loc, i); + Value part = + rewriter.create(loc, ptrAsInts, idxConst); + resource = rewriter.create( + loc, llvm4xI32, resource, part, idxConst); + } + + Value numRecords; + if (memrefType.hasStaticShape()) { + numRecords = createI32Constant( + rewriter, loc, + static_cast(memrefType.getNumElements() * elementByteWidth)); + } else { + Value maxIndex; + for (uint32_t i = 0, e = memrefType.getRank(); i < e; ++i) { + Value size = 
memrefDescriptor.size(rewriter, loc, i); + Value stride = memrefDescriptor.stride(rewriter, loc, i); + stride = rewriter.create(loc, stride, byteWidthConst); + Value maxThisDim = rewriter.create(loc, size, stride); + maxIndex = maxIndex ? rewriter.create(loc, maxIndex, + maxThisDim) + : maxThisDim; + } + numRecords = rewriter.create(loc, llvmI32, maxIndex); + } + resource = rewriter.create( + loc, llvm4xI32, resource, numRecords, + this->createIndexConstant(rewriter, loc, 2)); + + // Final word: + // bits 0-11: dst sel, ignored by these intrinsics + // bits 12-14: data format (ignored, must be nonzero, 7=float) + // bits 15-18: data format (ignored, must be nonzero, 4=32bit) + // bit 19: In nested heap (0 here) + // bit 20: Behavior on unmap (0 means "return 0 / ignore") + // bits 21-22: Index stride for swizzles (N/A) + // bit 23: Add thread ID (0) + // bit 24: Reserved to 1 (RDNA) or 0 (CDNA) + // bits 25-26: Reserved (0) + // bit 27: Buffer is non-volatile (CDNA only) + // bits 28-29: Out of bounds select (0 = structured, 1 = raw, 2 = none, 3 = + // swizzles) RDNA only + // bits 30-31: Type (must be 0) + uint32_t word3 = (7 << 12) | (4 << 15); + if (adaptor.targetIsRDNA()) { + word3 |= (1 << 24); + uint32_t oob = adaptor.boundsCheck() ? 1 : 2; + word3 |= (oob << 28); + } + Value word3Const = createI32Constant(rewriter, loc, word3); + resource = rewriter.create( + loc, llvm4xI32, resource, word3Const, + this->createIndexConstant(rewriter, loc, 3)); + args.push_back(resource); + + // Indexing (voffset) + Value voffset; + for (auto &pair : llvm::enumerate(adaptor.indices())) { + size_t i = pair.index(); + Value index = pair.value(); + Value strideOp; + if (ShapedType::isDynamicStrideOrOffset(strides[i])) { + strideOp = rewriter.create( + loc, memrefDescriptor.stride(rewriter, loc, i), byteWidthConst); + } else { + strideOp = + createI32Constant(rewriter, loc, strides[i] * elementByteWidth); + } + index = rewriter.create(loc, index, strideOp); + voffset = + voffset ? rewriter.create(loc, voffset, index) : index; + } + if (adaptor.indexOffset().hasValue()) { + int32_t indexOffset = *gpuOp.indexOffset() * elementByteWidth; + Value extraOffsetConst = createI32Constant(rewriter, loc, indexOffset); + voffset = + voffset ? 
rewriter.create(loc, voffset, extraOffsetConst) + : extraOffsetConst; + } + args.push_back(voffset); + + Value sgprOffset = adaptor.sgprOffset(); + if (!sgprOffset) + sgprOffset = createI32Constant(rewriter, loc, 0); + if (ShapedType::isDynamicStrideOrOffset(offset)) + sgprOffset = rewriter.create( + loc, memrefDescriptor.offset(rewriter, loc), sgprOffset); + else if (offset > 0) + sgprOffset = rewriter.create( + loc, sgprOffset, createI32Constant(rewriter, loc, offset)); + args.push_back(sgprOffset); + + // bit 0: GLC = 0 (atomics drop value, less coherency) + // bits 1-2: SLC, DLC = 0 (similarly) + // bit 3: swizzled (0 for raw) + args.push_back(createI32Constant(rewriter, loc, 0)); + + llvm::SmallVector resultTypes(gpuOp->getNumResults(), + llvmBufferValType); + Operation *lowered = rewriter.create(loc, resultTypes, args, + ArrayRef()); + if (lowered->getNumResults() == 1) { + Value replacement = lowered->getResults()[0]; + if (llvmBufferValType != llvmWantedDataType) { + replacement = rewriter.create(loc, llvmWantedDataType, + replacement); + } + rewriter.replaceOp(gpuOp, replacement); + } else { + rewriter.eraseOp(gpuOp); + } + return success(); + } +}; } // namespace void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) { @@ -169,6 +387,12 @@ "__ocml_sqrt_f64"); patterns.add>(converter, "__ocml_tanh_f32", "__ocml_tanh_f64"); + + patterns.add< + GCNRawBufferOpLowering, + GCNRawBufferOpLowering, + GCNRawBufferOpLowering>(converter); } std::unique_ptr> diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -1159,6 +1159,38 @@ return success(); } +//===----------------------------------------------------------------------===// +// GPU_GCNRawBuffer*Op +//===----------------------------------------------------------------------===// +namespace { +template +LogicalResult verifyGCNRawBufferOp(T &op) { + MemRefType bufferType = op.memref().getType().template cast(); + if (bufferType.getMemorySpaceAsInt() != 0) + return op.emitOpError( + "Buffer ops must operate on a memref in global memory"); + if (!bufferType.hasRank()) + return op.emitOpError( + "Cannot meaningfully buffer_store to an unranked memref"); + if (static_cast(op.indices().size()) != bufferType.getRank()) + return op.emitOpError("Expected " + Twine(bufferType.getRank()) + + " indices to memref"); + return success(); +} +} // end anonymous namespace + +LogicalResult GCNRawBufferLoadOp::verify() { + return verifyGCNRawBufferOp(*this); +} + +LogicalResult GCNRawBufferStoreOp::verify() { + return verifyGCNRawBufferOp(*this); +} + +LogicalResult GCNRawBufferAtomicFaddOp::verify() { + return verifyGCNRawBufferOp(*this); +} + /// This is a common class used for patterns of the form /// "someop(memrefcast) -> someop". It folds the source of any memref.cast /// into the root operation directly. 
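For reference, the resource word 3 constants checked by the new gpu-to-rocdl tests further down (159744, 285372416, and 553807872) follow directly from the bit layout spelled out in GCNRawBufferOpLowering. A minimal standalone C++ sketch of that computation, not part of the patch and with illustrative helper names:

#include <cstdint>
#include <cstdio>

// Mirrors the word3 assembly in GCNRawBufferOpLowering:
//   bits 12-14 = 7 and bits 15-18 = 4 (format fields, ignored but required nonzero),
//   bit 24 set on RDNA, bits 28-29 = OOB_SELECT (1 = checked, 2 = checks disabled),
//   all other bits left at 0 for raw buffers.
static uint32_t descriptorWord3(bool targetIsRDNA, bool boundsCheck) {
  uint32_t word3 = (7u << 12) | (4u << 15);
  if (targetIsRDNA) {
    word3 |= 1u << 24;
    uint32_t oob = boundsCheck ? 1u : 2u;
    word3 |= oob << 28;
  }
  return word3;
}

int main() {
  std::printf("%u\n", descriptorWord3(false, true)); // 159744    (CDNA)
  std::printf("%u\n", descriptorWord3(true, true));  // 285372416 (RDNA, bounds checked)
  std::printf("%u\n", descriptorWord3(true, false)); // 553807872 (RDNA, checks disabled)
  return 0;
}
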
diff --git a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp --- a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp @@ -84,6 +84,76 @@ p << " " << getOperands() << " : " << vdata().getType(); } +// ::= +// `llvm.amdgcn.raw.buffer.load.* %rsrc, %offset, %soffset, %aux +// : result_type` +ParseResult RawBufferLoadOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 4) || parser.parseColonType(type) || + parser.addTypeToList(type, result.types)) + return failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + return parser.resolveOperands(ops, {i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands); +} + +void RawBufferLoadOp::print(OpAsmPrinter &p) { + p << " " << getOperands() << " : " << res().getType(); +} + +// ::= +// `llvm.amdgcn.raw.buffer.store.* %vdata, %rsrc, %offset, +// %soffset, %aux : result_type` +ParseResult RawBufferStoreOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 5) || parser.parseColonType(type)) + return failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + + if (parser.resolveOperands(ops, {type, i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands)) + return failure(); + return success(); +} + +void RawBufferStoreOp::print(OpAsmPrinter &p) { + p << " " << getOperands() << " : " << vdata().getType(); +} + +// ::= +// `llvm.amdgcn.raw.buffer.atomic.fadd.* %vdata, %rsrc, %offset, +// %soffset, %aux : result_type` +ParseResult RawBufferAtomicFAddOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 5) || parser.parseColonType(type)) + return failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + + if (parser.resolveOperands(ops, {type, i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands)) + return failure(); + return success(); +} + +void RawBufferAtomicFAddOp::print(mlir::OpAsmPrinter &p) { + p << " " << getOperands() << " : " << vdata().getType(); +} + //===----------------------------------------------------------------------===// // ROCDLDialect initialization, type parsing, and registration. 
//===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -386,3 +386,125 @@ gpu.return } } + +// ----- + +gpu.module @test_module { + // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32 + gpu.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: return %[[ret]] + %0 = gpu.gcn_raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi32>, i32 -> i32 + gpu.return %0 : i32 + } + + // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_rdna + gpu.func @gpu_gcn_raw_buffer_load_i32_rdna(%buf: memref<64xi32>, %idx: i32) -> i32 { + // CHECK: %[[word3:.*]] = llvm.mlir.constant(285372416 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: return %[[ret]] + %0 = gpu.gcn_raw_buffer_load {boundsCheck = true, targetIsRDNA = true} %buf[%idx] : memref<64xi32>, i32 -> i32 + gpu.return %0 : i32 + } + + // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_rdna_oob_off + gpu.func @gpu_gcn_raw_buffer_load_i32_rdna_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 { + // CHECK: %[[word3:.*]] = llvm.mlir.constant(553807872 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: return %[[ret]] + %0 = gpu.gcn_raw_buffer_load {boundsCheck = false, targetIsRDNA = true} %buf[%idx] : memref<64xi32>, i32 -> i32 + gpu.return %0 : i32 + } + + // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi32 + gpu.func @gpu_gcn_raw_buffer_load_2xi32(%buf: memref<64xi32>, %idx: i32) -> vector<2xi32> { + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi32> + // CHECK: return %[[ret]] + %0 = gpu.gcn_raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi32>, i32 -> vector<2xi32> + gpu.return %0 : vector<2xi32> + } + + // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i8 + gpu.func @gpu_gcn_raw_buffer_load_i8(%buf: memref<64xi8>, %idx: i32) -> i8 { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i8 + // CHECK: return %[[ret]] + %0 = gpu.gcn_raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi8>, i32 -> i8 + gpu.return %0 : i8 + } + + // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi8 + gpu.func @gpu_gcn_raw_buffer_load_2xi8(%buf: memref<64xi8>, %idx: i32) -> vector<2xi8> { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16 + // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : i16 to vector<2xi8> + // CHECK: return %[[ret]] + %0 = gpu.gcn_raw_buffer_load {boundsCheck = 
true, targetIsRDNA = false} %buf[%idx] : memref<64xi8>, i32 -> vector<2xi8> + gpu.return %0 : vector<2xi8> + } + + // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_16xi8 + gpu.func @gpu_gcn_raw_buffer_load_16xi8(%buf: memref<64xi8>, %idx: i32) -> vector<16xi8> { + // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32> + // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : vector<4xi32> to vector<16xi8> + // CHECK: return %[[ret]] + %0 = gpu.gcn_raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi8>, i32 -> vector<16xi8> + gpu.return %0 : vector<16xi8> + } +} + +// ----- + +// Since the lowering logic is shared with loads, only bitcasts need to be rechecked +gpu.module @test_module { + // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32 + gpu.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: rocdl.raw.buffer.store %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + gpu.gcn_raw_buffer_store {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32 + gpu.return + } + + // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_2xi8 + gpu.func @gpu_gcn_raw_buffer_store_2xi8(%value: vector<2xi8>, %buf: memref<64xi8>, %idx: i32) { + // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<2xi8> to i16 + // CHECK: rocdl.raw.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16 + gpu.gcn_raw_buffer_store {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : vector<2xi8> -> memref<64xi8>, i32 + gpu.return + } + + // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_16xi8 + gpu.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64xi8>, %idx: i32) { + // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<16xi8> to vector<4xi32> + // CHECK: rocdl.raw.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32> + gpu.gcn_raw_buffer_store {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : vector<16xi8> -> memref<64xi8>, i32 + gpu.return + } +} + +// ----- + +// And more so for atomic add +gpu.module @test_module { + // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32 + gpu.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: rocdl.raw.buffer.atomic.fadd %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32 + gpu.gcn_raw_buffer_atomic_fadd {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32 + gpu.return + } +} diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -259,6 +259,62 @@ return } + // CHECK-LABEL: func @buffer_load_f32_from_rank_1 + func @buffer_load_f32_from_rank_1(%src : memref<128xf32>, %offset : i32, %idx0 : i32) -> f32 { + // CHECK: gpu.gcn_raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}}[{{.*}}] sgprOffset %{{.*}} : memref<128xf32>, i32 -> f32 + %0 = gpu.gcn_raw_buffer_load 
{boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %src[%idx0] sgprOffset %offset : memref<128xf32>, i32 -> f32 + return %0 : f32 + } + + // CHECK-LABEL: func @buffer_load_f32_from_rank_4 + func @buffer_load_f32_from_rank_4(%src : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) -> f32 { + // CHECK: gpu.gcn_raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> f32 + %0 = gpu.gcn_raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %src[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> f32 + return %0 : f32 + } + + // CHECK-LABEL: func @buffer_load_4xf32_from_rank_4 + func @buffer_load_4xf32_from_rank_4(%src : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) -> vector<4xf32> { + // CHECK: gpu.gcn_raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> vector<4xf32> + %0 = gpu.gcn_raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %src[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> vector<4xf32> + return %0 : vector<4xf32> + } + + // CHECK-LABEL: func @buffer_store_f32_to_rank_1 + func @buffer_store_f32_to_rank_1(%value : f32, %dst : memref<128xf32>, %offset : i32, %idx0 : i32) { + // CHECK: gpu.gcn_raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128xf32>, i32 + gpu.gcn_raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0] sgprOffset %offset : f32 -> memref<128xf32>, i32 + return + } + + // CHECK-LABEL: func @buffer_store_f32_to_rank_4 + func @buffer_store_f32_to_rank_4(%value : f32, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) { + // CHECK: gpu.gcn_raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + gpu.gcn_raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + return + } + + // CHECK-LABEL: func @buffer_store_4xf32_to_rank_4 + func @buffer_store_4xf32_to_rank_4(%value : vector<4xf32>, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) { + // CHECK: gpu.gcn_raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : vector<4xf32> -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + gpu.gcn_raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : vector<4xf32> -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + return + } + + // CHECK-LABEL: func @buffer_atomic_fadd_f32_to_rank_1 + func @buffer_atomic_fadd_f32_to_rank_1(%value : f32, %dst : memref<128xf32>, %offset : i32, %idx0 : i32) { + // CHECK: gpu.gcn_raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : 
i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128xf32>, i32 + gpu.gcn_raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0] sgprOffset %offset : f32 -> memref<128xf32>, i32 + return + } + + // CHECK-LABEL: func @buffer_atomic_fadd_f32_to_rank_4 + func @buffer_atomic_fadd_f32_to_rank_4(%value : f32, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) { + // CHECK: gpu.gcn_raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + gpu.gcn_raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + return + } + // CHECK-LABEL: func @set_default_device func @set_default_device(%arg0: i32) { // CHECK: gpu.set_default_device diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -167,6 +167,34 @@ llvm.return } +llvm.func @rocdl.raw.buffer(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %aux : i32, %vdata1 : f32, + %vdata2 : vector<2xf32>, %vdata4 : vector<4xf32>) { + // CHECK-LABEL: rocdl.raw.buffer + // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : f32 + // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<2xf32> + // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<4xf32> + + // CHECK: rocdl.raw.buffer.store %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : f32 + // CHECK: rocdl.raw.buffer.store %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<2xf32> + // CHECK: rocdl.raw.buffer.store %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<4xf32> + + // CHECK: rocdl.raw.buffer.atomic.fadd %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : f32 + + %r1 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : f32 + %r2 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xf32> + %r4 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xf32> + + rocdl.raw.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : f32 + rocdl.raw.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xf32> + rocdl.raw.buffer.store %vdata4, %rsrc, %offset, %offset, %aux : vector<4xf32> + + rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32 + + llvm.return +} + // ----- // expected-error@below {{attribute attached to unexpected op}} diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -169,21 +169,61 @@ %slc = llvm.mlir.constant(true) : i1 // CHECK-LABEL: rocdl.mubuf // CHECK: call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - %r1 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> // CHECK: call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - %r2 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> // CHECK: call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - %r4 = rocdl.buffer.load %rsrc, 
%vindex, %offset, %glc, %slc : vector<4xf32> // CHECK: call void @llvm.amdgcn.buffer.store.v1f32(<1 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - rocdl.buffer.store %vdata1, %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> // CHECK: call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - rocdl.buffer.store %vdata2, %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> // CHECK: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) + + %r1 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> + %r2 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> + %r4 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<4xf32> + + rocdl.buffer.store %vdata1, %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> + rocdl.buffer.store %vdata2, %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> rocdl.buffer.store %vdata4, %rsrc, %vindex, %offset, %glc, %slc : vector<4xf32> llvm.return } +llvm.func @rocdl.raw.buffer(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %vdata1 : i32, + %vdata2 : vector<2xi32>, + %vdata4 : vector<4xi32>) { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.buffer + // CHECK: call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + // CHECK: call void @llvm.amdgcn.raw.buffer.store.i32(i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + %r1 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : i32 + %r2 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xi32> + %r4 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xi32> + + rocdl.raw.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : i32 + rocdl.raw.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xi32> + rocdl.raw.buffer.store %vdata4, %rsrc, %offset, %soffset, %aux : vector<4xi32> + + llvm.return +} + +llvm.func @rocdl.raw.buffer.atomic(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %vdata1 : f32) { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.buffer.atomic + // CHECK: call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32 + + llvm.return +} + // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 256" "amdgpu-implicitarg-num-bytes"="56" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 1024"
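
For reference, the addressing math described in the op documentation (indices, `indexOffset`, and the record count are all in element units and are scaled to bytes during lowering, with numRecords = max_d(size(d) * stride(d)) * sizeof(element) for dynamically shaped memrefs) can be summarized by the following standalone sketch; the struct and member names are illustrative only and not part of the patch:

#include <cstdint>
#include <vector>
#include <algorithm>

// Illustrative host-side model of the address math in GCNRawBufferOpLowering.
//   voffset    = (sum_i indices[i] * strides[i] + indexOffset) * elementBytes
//   numRecords = max_d(sizes[d] * strides[d]) * elementBytes   (dynamic shapes)
struct BufferAddressing {
  std::vector<int64_t> sizes;   // memref sizes per dimension
  std::vector<int64_t> strides; // memref strides, in elements
  int64_t elementBytes;         // sizeof(elementType(memref))

  int64_t voffset(const std::vector<int64_t> &indices, int64_t indexOffset) const {
    int64_t off = indexOffset;
    for (size_t i = 0; i < indices.size(); ++i)
      off += indices[i] * strides[i];
    return off * elementBytes; // every term is an element count, scaled once
  }

  int64_t numRecords() const {
    int64_t maxIndex = 0;
    for (size_t d = 0; d < sizes.size(); ++d)
      maxIndex = std::max(maxIndex, sizes[d] * strides[d]);
    return maxIndex * elementBytes;
  }
};

In the lowering itself this arithmetic is emitted as LLVM dialect operations on the converted memref descriptor rather than computed on the host.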