This is an archive of the discontinued LLVM Phabricator instance.

[mlir] [VectorOps] Improve scatter/gather CPU performance
ClosedPublic

Authored by aartbik on Jul 22 2020, 4:52 PM.

Download Raw Diff

Details

Reviewers

nicolasvasilache
ftynse
arpith-jacob
bkramer
reidtatge

Commits

rG1485fd295b2a: [mlir] [VectorOps] Improve scatter/gather CPU performance

Summary

Replaced the linearized address with the proper LLVM way of
defining vector of base + indices in SIMD style. This yields
much better code. Some prototype results with microbenchmarking
sparse matrix x vector with 50% sparsity (about 2-3x faster):

            LINEARIZED        IMPROVED
GFLOPS      sdot    saxpy     sdot    saxpy
16x16       1.6     1.4       4.4     2.1
32x32       1.7     1.6       5.8     5.9
64x64       1.7     1.7       6.4     6.4
128x128     1.7     1.7       5.9     5.9
256x256     1.6     1.6       6.1     6.0
512x512     1.4     1.4       4.9     4.7

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

aartbik created this revision.Jul 22 2020, 4:52 PM

Herald added a reviewer: nicolasvasilache. · View Herald TranscriptJul 22 2020, 4:52 PM

Herald added a reviewer: ftynse. · View Herald Transcript

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: msifontes, jurahul, Kayjukh and 13 others. · View Herald Transcript

aartbik added reviewers: arpith-jacob, bkramer, reidtatge.Jul 22 2020, 4:53 PM

Harbormaster completed remote builds in B65303: Diff 279968.Jul 22 2020, 5:13 PM

nicolasvasilache accepted this revision.Jul 22 2020, 10:48 PM

This revision is now accepted and ready to land.Jul 22 2020, 10:48 PM

Closed by commit rG1485fd295b2a: [mlir] [VectorOps] Improve scatter/gather CPU performance (authored by aartbik). · Explain WhyJul 22 2020, 11:47 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

mlir/

include/

mlir/

Dialect/

Vector/

VectorOps.td

2 lines

lib/

Conversion/

VectorToLLVM/

ConvertVectorToLLVM.cpp

19 lines

test/

Conversion/

VectorToLLVM/

vector-to-llvm.mlir

6 lines

Diff 280036

mlir/include/mlir/Dialect/Vector/VectorOps.td

Show First 20 Lines • Show All 1,238 Lines • ▼ Show 20 Lines	let description = [{
hardware ISA support for a scatter. The semantics of the operation closely		hardware ISA support for a scatter. The semantics of the operation closely
correspond to those of the `llvm.masked.scatter`		correspond to those of the `llvm.masked.scatter`
[intrinsic](https://llvm.org/docs/LangRef.html#llvm-masked-scatter-intrinsics).		[intrinsic](https://llvm.org/docs/LangRef.html#llvm-masked-scatter-intrinsics).

Example:		Example:

```mlir		```mlir
vector.scatter %base, %indices, %mask, %value		vector.scatter %base, %indices, %mask, %value
: vector<16xi32>, vector<16xi1>, vector<16xf32> into memref<?f32>		: vector<16xi32>, vector<16xi1>, vector<16xf32> into memref<?xf32>
```		```
}];		}];
let extraClassDeclaration = [{		let extraClassDeclaration = [{
MemRefType getMemRefType() {		MemRefType getMemRefType() {
return base().getType().cast<MemRefType>();		return base().getType().cast<MemRefType>();
}		}
VectorType getIndicesVectorType() {		VectorType getIndicesVectorType() {
return indices().getType().cast<VectorType>();		return indices().getType().cast<VectorType>();
▲ Show 20 Lines • Show All 458 Lines • Show Last 20 Lines

mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp

Show First 20 Lines • Show All 141 Lines • ▼ Show 20 Lines	LogicalResult getIndexedPtrs(ConversionPatternRewriter &rewriter,
//		//
int64_t offset;		int64_t offset;
SmallVector<int64_t, 4> strides;		SmallVector<int64_t, 4> strides;
auto successStrides = getStridesAndOffset(memRefType, strides, offset);		auto successStrides = getStridesAndOffset(memRefType, strides, offset);
if (failed(successStrides) || strides.size() != 1 || strides[0] != 1 ||		if (failed(successStrides) || strides.size() != 1 || strides[0] != 1 ||
offset != 0 || memRefType.getMemorySpace() != 0)		offset != 0 || memRefType.getMemorySpace() != 0)
return failure();		return failure();

// Base pointer.		// Create a vector of pointers from base and indices.
MemRefDescriptor memRefDescriptor(memref);		MemRefDescriptor memRefDescriptor(memref);
Value base = memRefDescriptor.alignedPtr(rewriter, loc);		Value base = memRefDescriptor.alignedPtr(rewriter, loc);

// Create a vector of pointers from base and indices.
//
// TODO: this step serializes the address computations unfortunately,
// ideally we would like to add splat(base) + index_vector
// in SIMD form, but this does not match well with current
// constraints of the standard and vector dialect....
//
int64_t size = vType.getDimSize(0);		int64_t size = vType.getDimSize(0);
auto pType = memRefDescriptor.getElementType();		auto pType = memRefDescriptor.getElementType();
auto ptrsType = LLVM::LLVMType::getVectorTy(pType, size);		auto ptrsType = LLVM::LLVMType::getVectorTy(pType, size);
auto idxType = typeConverter.convertType(iType);		ptrs = rewriter.create<LLVM::GEPOp>(loc, ptrsType, base, indices);
ptrs = rewriter.create<LLVM::UndefOp>(loc, ptrsType);
for (int64_t i = 0; i < size; i++) {
Value off =
extractOne(rewriter, typeConverter, loc, indices, idxType, 1, i);
Value ptr = rewriter.create<LLVM::GEPOp>(loc, pType, base, off);
ptrs = insertOne(rewriter, typeConverter, loc, ptrs, ptr, ptrsType, 1, i);
}
return success();		return success();
}		}

static LogicalResult		static LogicalResult
replaceTransferOpWithLoadOrStore(ConversionPatternRewriter &rewriter,		replaceTransferOpWithLoadOrStore(ConversionPatternRewriter &rewriter,
LLVMTypeConverter &typeConverter, Location loc,		LLVMTypeConverter &typeConverter, Location loc,
TransferReadOp xferOp,		TransferReadOp xferOp,
ArrayRef<Value> operands, Value dataPtr) {		ArrayRef<Value> operands, Value dataPtr) {
▲ Show 20 Lines • Show All 1,160 Lines • Show Last 20 Lines

mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir

	Show First 20 Lines • Show All 970 Lines • ▼ Show 20 Lines
	// CHECK: llvm.return %[[T]] : !llvm<"<16 x float>">			// CHECK: llvm.return %[[T]] : !llvm<"<16 x float>">

	func @gather_op(%arg0: memref<?xf32>, %arg1: vector<3xi32>, %arg2: vector<3xi1>, %arg3: vector<3xf32>) -> vector<3xf32> {			func @gather_op(%arg0: memref<?xf32>, %arg1: vector<3xi32>, %arg2: vector<3xi1>, %arg3: vector<3xf32>) -> vector<3xf32> {
	%0 = vector.gather %arg0, %arg1, %arg2, %arg3 : (memref<?xf32>, vector<3xi32>, vector<3xi1>, vector<3xf32>) -> vector<3xf32>			%0 = vector.gather %arg0, %arg1, %arg2, %arg3 : (memref<?xf32>, vector<3xi32>, vector<3xi1>, vector<3xf32>) -> vector<3xf32>
	return %0 : vector<3xf32>			return %0 : vector<3xf32>
	}			}

	// CHECK-LABEL: func @gather_op			// CHECK-LABEL: func @gather_op
	// CHECK: %[[G:.*]] = llvm.intr.masked.gather %{{.*}}, %{{.*}}, %{{.*}} {alignment = 4 : i32} : (!llvm<"<3 x float*>">, !llvm<"<3 x i1>">, !llvm<"<3 x float>">) -> !llvm<"<3 x float>">			// CHECK: %[[P:.*]] = llvm.getelementptr {{.*}}[%{{.*}}] : (!llvm<"float*">, !llvm<"<3 x i32>">) -> !llvm<"<3 x float*>">
				// CHECK: %[[G:.*]] = llvm.intr.masked.gather %[[P]], %{{.*}}, %{{.*}} {alignment = 4 : i32} : (!llvm<"<3 x float*>">, !llvm<"<3 x i1>">, !llvm<"<3 x float>">) -> !llvm<"<3 x float>">
	// CHECK: llvm.return %[[G]] : !llvm<"<3 x float>">			// CHECK: llvm.return %[[G]] : !llvm<"<3 x float>">

	func @scatter_op(%arg0: memref<?xf32>, %arg1: vector<3xi32>, %arg2: vector<3xi1>, %arg3: vector<3xf32>) {			func @scatter_op(%arg0: memref<?xf32>, %arg1: vector<3xi32>, %arg2: vector<3xi1>, %arg3: vector<3xf32>) {
	vector.scatter %arg0, %arg1, %arg2, %arg3 : vector<3xi32>, vector<3xi1>, vector<3xf32> into memref<?xf32>			vector.scatter %arg0, %arg1, %arg2, %arg3 : vector<3xi32>, vector<3xi1>, vector<3xf32> into memref<?xf32>
	return			return
	}			}

	// CHECK-LABEL: func @scatter_op			// CHECK-LABEL: func @scatter_op
	// CHECK: llvm.intr.masked.scatter %{{.*}}, %{{.*}}, %{{.*}} {alignment = 4 : i32} : !llvm<"<3 x float>">, !llvm<"<3 x i1>"> into !llvm<"<3 x float*>">			// CHECK: %[[P:.*]] = llvm.getelementptr {{.*}}[%{{.*}}] : (!llvm<"float*">, !llvm<"<3 x i32>">) -> !llvm<"<3 x float*>">
				// CHECK: llvm.intr.masked.scatter %{{.*}}, %[[P]], %{{.*}} {alignment = 4 : i32} : !llvm<"<3 x float>">, !llvm<"<3 x i1>"> into !llvm<"<3 x float*>">
	// CHECK: llvm.return			// CHECK: llvm.return