
Fix bugs in GPUToNVVM lowering
ClosedPublic

Authored by harsh on Jan 24 2022, 4:02 PM.

Details

Summary

The current lowering from GPU to NVVM does not
correctly handle the following cases when lowering
the gpu.shuffle op.

  1. When the active width is set to 32 (all lanes), the current approach
computes (1 << 32) - 1, which results in a poison value in the LLVM IR.
We fix this by computing the active mask as (-1) >> (32 - width) instead.

  2. In the case of shuffle up, the computation of the third operand c has to
differ from the other three modes due to the op definition in the PTX ISA
reference (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html).
Specifically, the predicate value is computed as j >= maxLane for up and
j <= maxLane for all other modes. We fix this by computing maskAndClamp as
32 - width for this mode. Both computations are sketched below.
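To make the two fixes concrete, here is a minimal standalone C++ sketch of the
scalar arithmetic involved. It is an illustration only: the helper
computeShflOperands, the ShflOperands struct, and the ShuffleMode enum are
hypothetical names introduced for this sketch, not part of the patch or of the
MLIR API, and the sketch models the values the lowering produces rather than
the rewrite-pattern code itself.

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  // Illustrative names, not from the patch.
  enum class ShuffleMode { Xor, Down, Up, Idx };

  struct ShflOperands {
    uint32_t activeMask;   // mask of active lanes
    uint32_t maskAndClamp; // third operand c of the shuffle
  };

  // Scalar model of the operand computation described in the summary.
  // `width` is the active width of the gpu.shuffle op, in the range 1..32.
  ShflOperands computeShflOperands(uint32_t width, ShuffleMode mode) {
    assert(width >= 1 && width <= 32);
    // Old approach: (1 << 32) - 1 is poison when width == 32.
    // New approach: (-1) >> (32 - width) is well defined for all widths,
    // since the shift amount stays in 0..31.
    uint32_t numLeadInactiveLane = 32 - width;
    uint32_t activeMask = ~0u >> numLeadInactiveLane;

    // For shuffle up the PTX predicate is j >= maxLane, so the clamp is the
    // number of leading inactive lanes (32 - width); for the other modes the
    // predicate is j <= maxLane and the clamp is width - 1.
    uint32_t maskAndClamp =
        mode == ShuffleMode::Up ? numLeadInactiveLane : width - 1;
    return {activeMask, maskAndClamp};
  }

  int main() {
    // width == 32 (all lanes): the old (1 << 32) - 1 would be poison; the
    // new formulation yields 0xffffffff as expected.
    ShflOperands full = computeShflOperands(32, ShuffleMode::Xor);
    std::printf("activeMask=0x%x maskAndClamp=%u\n", full.activeMask,
                full.maskAndClamp); // activeMask=0xffffffff maskAndClamp=31

    // Shuffle up with width 16: maskAndClamp becomes 32 - 16 = 16.
    ShflOperands up = computeShflOperands(16, ShuffleMode::Up);
    std::printf("activeMask=0x%x maskAndClamp=%u\n", up.activeMask,
                up.maskAndClamp); // activeMask=0xffff maskAndClamp=16
    return 0;
  }

The point of the reformulation is that every intermediate value stays within a
defined 32-bit shift range, so no poison is introduced for the full-width case.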

TEST: We modify the existing test and add more checks for the up mode.

Diff Detail

Event Timeline

harsh created this revision.Jan 24 2022, 4:02 PM
harsh requested review of this revision.Jan 24 2022, 4:02 PM
ThomasRaoux accepted this revision.Jan 24 2022, 5:25 PM
ThomasRaoux added a subscriber: ThomasRaoux.

LG, please address the variable naming before landing the patch.

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
95

this is actually the number of inactive lanes? Should probably be named numLeadInactiveLane?

This revision is now accepted and ready to land.Jan 24 2022, 5:25 PM
harsh added inline comments.Jan 24 2022, 6:15 PM
mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
95

Yes, you are right. Will change.

harsh updated this revision to Diff 402728.Jan 24 2022, 6:23 PM

Updated patch based on Thomas' comments.

This revision was landed with ongoing or failed builds.Jan 24 2022, 7:25 PM
This revision was automatically updated to reflect the committed changes.