Diff 553214

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

Show First 20 Lines • Show All 102 Lines • ▼ Show 20 Lines	class ROCDL_MbcntOp<string mnemonic> :
let assemblyFormat = [{		let assemblyFormat = [{
$in0 `,` $in1 attr-dict `:` `(` type($in0) `,` type($in1) `)` `->` type($res)		$in0 `,` $in1 attr-dict `:` `(` type($in0) `,` type($in1) `)` `->` type($res)
}];		}];
}		}

def ROCDL_MbcntLoOp : ROCDL_MbcntOp<"lo">;		def ROCDL_MbcntLoOp : ROCDL_MbcntOp<"lo">;
def ROCDL_MbcntHiOp : ROCDL_MbcntOp<"hi">;		def ROCDL_MbcntHiOp : ROCDL_MbcntOp<"hi">;

		// `rocdl.ds_swizzle`: takes two i32 operands (`$src`, `$offset`) and produces
		// one i32 result. When translated to LLVM IR, the builder below emits a call
		// to the `llvm.amdgcn.ds.swizzle` intrinsic with the operands in that order.
		// Textual form: `rocdl.ds_swizzle %src, %offset : (i32, i32) -> i32`.
		// NOTE(review): the accompanying translation test only feeds a constant
		// `$offset`; confirm whether the intrinsic requires an immediate there.
		def ROCDL_DsSwizzleOp :
		ROCDL_Op<"ds_swizzle">,
		Results<(outs I32:$res)>,
		Arguments<(ins I32:$src,
		I32:$offset)>
		{
		string llvmBuilder = [{
		$res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_ds_swizzle, {$src, $offset});
		}];
		let assemblyFormat = [{
		$src `,` $offset attr-dict `:` `(` type($src) `,` type($offset) `)` `->` type($res)
		}];
		}

		// `rocdl.ds_bpermute`: takes two i32 operands (`$index`, `$src`) and produces
		// one i32 result. When translated to LLVM IR, the builder below emits a call
		// to the `llvm.amdgcn.ds.bpermute` intrinsic with the operands in that order
		// (index first, then the value being permuted).
		// Textual form: `rocdl.ds_bpermute %index, %src : (i32, i32) -> i32`.
		def ROCDL_DsBpermuteOp :
		ROCDL_Op<"ds_bpermute">,
		Results<(outs I32:$res)>,
		Arguments<(ins I32:$index,
		I32:$src)>
		{
		string llvmBuilder = [{
		$res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_ds_bpermute, {$index, $src});
		}];
		let assemblyFormat = [{
		$index `,` $src attr-dict `:` `(` type($index) `,` type($src) `)` `->` type($res)
		}];
		}


//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Thread index and Block index		// Thread index and Block index

def ROCDL_ThreadIdXOp : ROCDL_SpecialRegisterOp<"workitem.id.x">;		def ROCDL_ThreadIdXOp : ROCDL_SpecialRegisterOp<"workitem.id.x">;
def ROCDL_ThreadIdYOp : ROCDL_SpecialRegisterOp<"workitem.id.y">;		def ROCDL_ThreadIdYOp : ROCDL_SpecialRegisterOp<"workitem.id.y">;
def ROCDL_ThreadIdZOp : ROCDL_SpecialRegisterOp<"workitem.id.z">;		def ROCDL_ThreadIdZOp : ROCDL_SpecialRegisterOp<"workitem.id.z">;

def ROCDL_BlockIdXOp : ROCDL_SpecialRegisterOp<"workgroup.id.x">;		def ROCDL_BlockIdXOp : ROCDL_SpecialRegisterOp<"workgroup.id.x">;
▲ Show 20 Lines • Show All 372 Lines • Show Last 20 Lines

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

Show First 20 Lines • Show All 58 Lines • ▼ Show 20 Lines
/// Returns true when every memref-typed argument of `func` is accepted by
/// `LLVMTypeConverter::canConvertToBarePtr`. Arguments that are not a
/// `BaseMemRefType` do not influence the result, so a function without memref
/// arguments yields true. The loop visits all arguments; it does not stop at
/// the first memref that fails the check.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {		static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
bool canBeBare = true;		bool canBeBare = true;
for (Type type : func.getArgumentTypes())		for (Type type : func.getArgumentTypes())
if (auto memrefTy = dyn_cast<BaseMemRefType>(type))		if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);		canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
return canBeBare;		return canBeBare;
}		}

		/// Emits, at `loc`, the op sequence that yields the current lane id as an
		/// i32 value: two 32-bit constants (0 and -1), then `rocdl.mbcnt.lo(-1, 0)`
		/// whose result is fed into `rocdl.mbcnt.hi(-1, <lo>)`. The result of the
		/// `mbcnt.hi` op is returned.
		///
		/// `indexBitwidth` is accepted for the callers' convenience but is not read
		/// here; the returned value is always built with a 32-bit integer type.
		Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
		                const unsigned indexBitwidth) {
		  IntegerType i32Ty = IntegerType::get(rewriter.getContext(), 32);
		  // Constants are created in the same order as before (0 first, then -1).
		  Value cstZero = rewriter.createOrFold<arith::ConstantIntOp>(loc, 0, 32);
		  Value allOnes = rewriter.createOrFold<arith::ConstantIntOp>(loc, -1, 32);
		  // Low half of the count, seeded with zero.
		  Value lowCount = rewriter.create<ROCDL::MbcntLoOp>(
		      loc, i32Ty, ValueRange{allOnes, cstZero});
		  // High half, accumulated on top of the low count.
		  return rewriter.create<ROCDL::MbcntHiOp>(loc, i32Ty,
		                                           ValueRange{allOnes, lowCount});
		}

namespace {		namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {		struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;		using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

LogicalResult		LogicalResult
matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,		matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {		ConversionPatternRewriter &rewriter) const override {
auto loc = op->getLoc();		auto loc = op->getLoc();
Show All 18 Lines	if (indexBitwidth > 32) {
laneId = rewriter.create<LLVM::TruncOp>(		laneId = rewriter.create<LLVM::TruncOp>(
loc, IntegerType::get(context, indexBitwidth), laneId);		loc, IntegerType::get(context, indexBitwidth), laneId);
}		}
rewriter.replaceOp(op, {laneId});		rewriter.replaceOp(op, {laneId});
return success();		return success();
}		}
};		};

		struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
		  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

		  /// Lowers a shuffle to the corresponding ROCDL ops.
		  ///
		  /// Use the `width` argument to see if src lane is participating.
		  /// If not the dstLane would be itself.
		  ///
		  ///  Shuffle with DS Bpermute:
		  ///   let shflMode = [xor, up, down, idx]
		  ///   let width = 32(usually warpsize), step = [1, 2, 4, 8, 16, ... , width].
		  ///   1. curLaneId = using mbcnt.lo + mbcnt.hi
		  ///   2. widthOrZeroIfOutside = (curLaneId + width) & -width
		  ///   3. dstLane = shflMode(curLaneId, step)
		  ///   4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
		  ///   5. dstLane = isActiveSrcLane ? dstLane : curLaneId
		  ///   6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
		  ///   7. bpermute(dwordAlignedDstLane, shfl_value).
		  ///
		  /// Only 32-bit integer/float values and the XOR and IDX modes are handled;
		  /// anything else is reported as a match failure before any op is created.
		  /// On success the op is replaced by the pair
		  /// (shuffled value, isActiveSrcLane).
		  LogicalResult
		  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
		                  ConversionPatternRewriter &rewriter) const override {
		    Location loc = op->getLoc();
		    Value initShflValue = adaptor.getValue();
		    Type valueType = initShflValue.getType();

		    // TODO: Add support for non 32-bit shuffle values.
		    // `getIntOrFloatBitWidth` asserts on other types, so test the kind first.
		    if (!valueType.isIntOrFloat() || valueType.getIntOrFloatBitWidth() != 32)
		      return rewriter.notifyMatchFailure(
		          op, "only 32-bit integer or float shuffle values are supported");

		    // TODO: Add support for gpu::ShuffleMode::UP and gpu::ShuffleMode::DOWN.
		    // Reject unsupported modes up front so no IR is created on failure.
		    const gpu::ShuffleMode mode = op.getMode();
		    if (mode != gpu::ShuffleMode::XOR && mode != gpu::ShuffleMode::IDX)
		      return rewriter.notifyMatchFailure(
		          op, "only xor and idx shuffle modes are supported");

		    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
		    Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

		    // Step 2: widthOrZeroIfOutside = (curLaneId + width) & -width.
		    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
		    Value width = adaptor.getWidth();
		    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
		    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
		    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
		    Value widthOrZeroIfOutside =
		        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);

		    // Step 3: pick the lane to read from.
		    // TODO: Use ds_swizzle for XOR when step/offsets are constants for better
		    // perf.
		    Value dstLane;
		    if (mode == gpu::ShuffleMode::XOR)
		      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
		                                             adaptor.getOffset());
		    else // gpu::ShuffleMode::IDX: the offset is the lane index itself.
		      dstLane = adaptor.getOffset();

		    // Steps 4-5: fall back to the current lane when the target is inactive.
		    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
		        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
		    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
		                                                          dstLane, srcLaneId);

		    // Step 6: the permute index is lane * 4, i.e. lane << 2.
		    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
		    Value dwordAlignedDstLane =
		        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);

		    // Step 7: ds_bpermute operates on i32, so f32 payloads are bitcast to
		    // i32 on the way in and back to f32 on the way out.
		    const bool isFloatPayload = valueType.isF32();
		    if (isFloatPayload)
		      initShflValue =
		          rewriter.create<LLVM::BitcastOp>(loc, int32Type, initShflValue);
		    Value shflValue = rewriter.create<ROCDL::DsBpermuteOp>(
		        loc, int32Type, dwordAlignedDstLane, initShflValue);
		    if (isFloatPayload)
		      shflValue = rewriter.create<LLVM::BitcastOp>(loc, valueType, shflValue);

		    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
		    return success();
		  }
		};

/// Import the GPU Ops to ROCDL Patterns.		/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"		#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their		// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.		// corresponding ROCDL equivalent.
//		//
// This pass only handles device code and is not meant to be run on GPU host		// This pass only handles device code and is not meant to be run on GPU host
// code.		// code.
▲ Show 20 Lines • Show All 164 Lines • ▼ Show 20 Lines	patterns.add<GPUFuncOpLowering>(
ROCDL::ROCDLDialect::getKernelFuncAttrName()));		ROCDL::ROCDLDialect::getKernelFuncAttrName()));
if (Runtime::HIP == runtime) {		if (Runtime::HIP == runtime) {
patterns.add<GPUPrintfOpToHIPLowering>(converter);		patterns.add<GPUPrintfOpToHIPLowering>(converter);
} else if (Runtime::OpenCL == runtime) {		} else if (Runtime::OpenCL == runtime) {
// Use address space = 4 to match the OpenCL definition of printf()		// Use address space = 4 to match the OpenCL definition of printf()
patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);		patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
}		}

patterns.add<GPULaneIdOpToROCDL>(converter);		patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);

populateOpPatterns<math::AbsFOp>(converter, patterns, "__ocml_fabs_f32",		populateOpPatterns<math::AbsFOp>(converter, patterns, "__ocml_fabs_f32",
"__ocml_fabs_f64");		"__ocml_fabs_f64");
populateOpPatterns<math::AtanOp>(converter, patterns, "__ocml_atan_f32",		populateOpPatterns<math::AtanOp>(converter, patterns, "__ocml_atan_f32",
"__ocml_atan_f64");		"__ocml_atan_f64");
populateOpPatterns<math::Atan2Op>(converter, patterns, "__ocml_atan2_f32",		populateOpPatterns<math::Atan2Op>(converter, patterns, "__ocml_atan2_f32",
"__ocml_atan2_f64");		"__ocml_atan2_f64");
populateOpPatterns<math::CbrtOp>(converter, patterns, "__ocml_cbrt_f32",		populateOpPatterns<math::CbrtOp>(converter, patterns, "__ocml_cbrt_f32",
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

	Show First 20 Lines • Show All 518 Lines • ▼ Show 20 Lines
	// CHECK-LABEL: @spirv_exp			// CHECK-LABEL: @spirv_exp
	// CHECK: llvm.call @__ocml_exp_f32			// CHECK: llvm.call @__ocml_exp_f32
	spirv.func @spirv_exp(%arg0: vector<4xf32>) -> vector<4xf32> "None" {			spirv.func @spirv_exp(%arg0: vector<4xf32>) -> vector<4xf32> "None" {
	%0 = math.exp %arg0 : vector<4xf32>			%0 = math.exp %arg0 : vector<4xf32>
	spirv.ReturnValue %0 : vector<4xf32>			spirv.ReturnValue %0 : vector<4xf32>
	}			}
	}			}

				// -----

				// Test case: a uniform `gpu.all_reduce add` on an f32 constant. The
				// directives below only spot-check, in order, that the lowered IR
				// contains the shuffle expansion (add, and, xor, signed-less-than
				// compare, select, shl, rocdl.ds_bpermute) followed by a rocdl.barrier,
				// a bitcast and the floating-point add of the reduction.
				gpu.module @test_module {
				// CHECK-LABEL: func @gpu_all_reduce_op()
				gpu.func @gpu_all_reduce_op() {
				%arg0 = arith.constant 1.0 : f32
				// TODO: Check full IR expansion once lowering has settled.
				// CHECK: llvm.add
				// CHECK: llvm.and
				// CHECK: llvm.xor
				// CHECK: llvm.icmp "slt"
				// CHECK: llvm.select
				// CHECK: llvm.shl
				// CHECK: rocdl.ds_bpermute {{.*}}
				// CHECK: rocdl.barrier
				// CHECK: llvm.bitcast
				// CHECK: llvm.fadd
				%result = gpu.all_reduce add %arg0 uniform {} : (f32) -> (f32)

				gpu.return
				}
				}


				// -----

				// Test case: a uniform `gpu.all_reduce` on an i32 constant whose
				// combiner is given as a region (arith.xori of the two block
				// arguments). The directives below spot-check, in order, the shuffle
				// expansion (add, and, xor, signed-less-than compare, select, shl,
				// rocdl.ds_bpermute) followed by a rocdl.barrier.
				gpu.module @test_module {
				// CHECK-LABEL: func @gpu_all_reduce_region()
				gpu.func @gpu_all_reduce_region() {
				%arg0 = arith.constant 1 : i32
				// TODO: Check full IR expansion once lowering has settled.
				// CHECK: llvm.add
				// CHECK: llvm.and
				// CHECK: llvm.xor
				// CHECK: llvm.icmp "slt"
				// CHECK: llvm.select
				// CHECK: llvm.shl
				// CHECK: rocdl.ds_bpermute {{.*}}
				// CHECK: rocdl.barrier
				%result = gpu.all_reduce %arg0 uniform {
				^bb(%lhs : i32, %rhs : i32):
				%xor = arith.xori %lhs, %rhs : i32
				"gpu.yield"(%xor) : (i32) -> ()
				} : (i32) -> (i32)
				gpu.return
				}
				}

				// -----

				// Test case: two `gpu.shuffle` ops on the same f32 value, offset 4 and
				// width 23 -- first in `xor` mode, then in `idx` mode. For each op the
				// directives pin the full lowered sequence: lane id from rocdl.mbcnt.hi,
				// 0 - width, (laneId + width) & -width, the destination lane (an
				// llvm.xor in xor mode, the offset itself in idx mode), the
				// signed-less-than compare, the select, the shift left by 2, the
				// f32 -> i32 bitcast, rocdl.ds_bpermute, and the i32 -> f32 bitcast.
				// NOTE(review): width = 23 is not a power of two; confirm that this is
				// intentional given the `& -width` masking in the lowering.
				gpu.module @test_module {
				// CHECK-LABEL: func @gpu_shuffle()
				func.func @gpu_shuffle() -> (f32, f32) {
				// CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
				%arg0 = arith.constant 1.0 : f32
				// CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : i32
				%arg1 = arith.constant 4 : i32
				// CHECK: %[[#WIDTH:]] = llvm.mlir.constant(23 : i32) : i32
				%arg2 = arith.constant 23 : i32
				// CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
				// CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32
				// CHECK: %[[#NEG_WIDTH:]] = llvm.sub %[[#ZERO]], %[[#WIDTH]] : i32
				// CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
				// CHECK: %[[#WARP_OR_ZERO:]] = llvm.and %[[#ADD]], %[[#NEG_WIDTH]] : i32
				// CHECK: %[[#XOR:]] = llvm.xor %[[#LANE_ID]], %{{.*}} : i32
				// CHECK: %[[#CMP:]] = llvm.icmp "slt" %[[#XOR]], %[[#WARP_OR_ZERO]] : i32
				// CHECK: %[[#DST_LANE:]] = llvm.select %[[#CMP]], %[[#XOR]], %{{.*}} : i1, i32
				// CHECK: %[[#TWO:]] = llvm.mlir.constant(2 : i32) : i32
				// CHECK: %[[#ALIGNED_DST_LANE:]] = llvm.shl %[[#DST_LANE]], %[[#TWO]] : i32
				// CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
				// CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
				// CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
				%shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : f32
				// CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
				// CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32
				// CHECK: %[[#NEG_WIDTH:]] = llvm.sub %[[#ZERO]], %[[#WIDTH]] : i32
				// CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
				// CHECK: %[[#WARP_OR_ZERO:]] = llvm.and %[[#ADD]], %[[#NEG_WIDTH]] : i32
				// CHECK: %[[#CMP:]] = llvm.icmp "slt" %[[#OFFSET]], %[[#WARP_OR_ZERO]] : i32
				// CHECK: %[[#DST_LANE:]] = llvm.select %[[#CMP]], %[[#OFFSET]], %{{.*}} : i1, i32
				// CHECK: %[[#TWO:]] = llvm.mlir.constant(2 : i32) : i32
				// CHECK: %[[#ALIGNED_DST_LANE:]] = llvm.shl %[[#DST_LANE]], %[[#TWO]] : i32
				// CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
				// CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
				// CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
				%shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
				func.return %shfl, %shfli : f32, f32
				}
				}
				No newline at end of file

mlir/test/Conversion/GPUToROCDL/invalid.mlir

This file was deleted.

	// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file --verify-diagnostics


	// Demonstrate the need to register the cf and memref dialect as dependent.
	// CHECK-LABEL: @dependentDialect
	gpu.module @module {
	gpu.func @dependentDialect() {
	%arg0 = arith.constant 1 : i32
	// expected-error@+1 {{failed to legalize operation 'gpu.shuffle' that was explicitly marked illega}}
	%result = gpu.all_reduce %arg0 uniform {
	^bb(%lhs : i32, %rhs : i32):
	%xor = arith.xori %lhs, %rhs : i32
	"gpu.yield"(%xor) : (i32) -> ()
	} : (i32) -> (i32)
	gpu.return
	}
	}

mlir/test/Target/LLVMIR/rocdl.mlir

Show First 20 Lines • Show All 60 Lines • ▼ Show 20 Lines	llvm.func @rocdl.lane_id() -> i32 {
// CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])		// CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
%0 = llvm.mlir.constant(-1 : i32) : i32		%0 = llvm.mlir.constant(-1 : i32) : i32
%1 = llvm.mlir.constant(0 : i32) : i32		%1 = llvm.mlir.constant(0 : i32) : i32
%2 = rocdl.mbcnt.lo %0, %1 : (i32, i32) -> i32		%2 = rocdl.mbcnt.lo %0, %1 : (i32, i32) -> i32
%3 = rocdl.mbcnt.hi %0, %2 : (i32, i32) -> i32		%3 = rocdl.mbcnt.hi %0, %2 : (i32, i32) -> i32
llvm.return %3 : i32		llvm.return %3 : i32
}		}

		// Translation test: `rocdl.ds_swizzle` with a constant offset of 100 must
		// be emitted as a call to the `llvm.amdgcn.ds.swizzle` intrinsic.
		llvm.func @rocdl.swizzle(%src : i32) -> i32 {
		// CHECK-LABEL: rocdl.swizzle
		// CHECK: call i32 @llvm.amdgcn.ds.swizzle
		%offset = llvm.mlir.constant(100 : i32) : i32
		%0 = rocdl.ds_swizzle %src, %offset : (i32, i32) -> i32
		llvm.return %0 : i32
		}

		// Translation test: `rocdl.ds_bpermute` with a constant index of 10 must
		// be emitted as a call to the `llvm.amdgcn.ds.bpermute` intrinsic.
		llvm.func @rocdl.bpermute(%src : i32) -> i32 {
		// CHECK-LABEL: rocdl.bpermute
		// CHECK: call i32 @llvm.amdgcn.ds.bpermute
		%index = llvm.mlir.constant(10 : i32) : i32
		%0 = rocdl.ds_bpermute %index, %src : (i32, i32) -> i32
		llvm.return %0 : i32
		}

llvm.func @rocdl.barrier() {		llvm.func @rocdl.barrier() {
// CHECK: fence syncscope("workgroup") release		// CHECK: fence syncscope("workgroup") release
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()		// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
// CHECK-NEXT: fence syncscope("workgroup") acquire		// CHECK-NEXT: fence syncscope("workgroup") acquire
rocdl.barrier		rocdl.barrier
llvm.return		llvm.return
}		}

▲ Show 20 Lines • Show All 307 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][ROCM] Add Wave/Warp shuffle lowering and op for ROCM.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 553214

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

mlir/test/Conversion/GPUToROCDL/invalid.mlir

mlir/test/Target/LLVMIR/rocdl.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][ROCM] Add Wave/Warp shuffle lowering and op for ROCM.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 553214

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

mlir/test/Conversion/GPUToROCDL/invalid.mlir

mlir/test/Target/LLVMIR/rocdl.mlir

[mlir][ROCM] Add Wave/Warp shuffle lowering and op for ROCM.
ClosedPublic