This is an archive of the discontinued LLVM Phabricator instance.

[mlir][VectorOps] Lower vector.fma to llvm.fmuladd instead of llvm.fma
ClosedPublic

Authored by bkramer on Jul 13 2020, 3:26 AM.

Download Raw Diff

Details

Reviewers

nicolasvasilache
aartbik
ftynse

Commits

rG3bffe6022cc9: [mlir][VectorOps] Lower vector.fma to llvm.fmuladd instead of llvm.fma

Summary

These are semantically equivalent, but fmuladd allows decaying the op
into fmul+fadd if there is no fma instruction available. llvm.fma lowers
to scalar calls to libm fmaf, which is a lot slower.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

bkramer created this revision. · Jul 13 2020, 3:26 AM

Herald added a reviewer: ftynse. · View Herald Transcript · Jul 13 2020, 3:26 AM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: msifontes, jurahul, Kayjukh and 12 others. · View Herald Transcript

Great, is this the root cause of the issue we were seeing on older hardware?

This revision is now accepted and ready to land. · Jul 13 2020, 3:30 AM

In D83666#2146940, @nicolasvasilache wrote:

Great, is this the root cause of the issue we were seeing on older hardware?

Yup, this change makes the generated matmul 8x faster when targeting SSE with no fma instructions.

Closed by commit rG3bffe6022cc9: [mlir][VectorOps] Lower vector.fma to llvm.fmuladd instead of llvm.fma (authored by bkramer). · Explain Why · Jul 13 2020, 3:37 AM

This revision was automatically updated to reflect the committed changes.

Harbormaster completed remote builds in B63934: Diff 277372. · Jul 13 2020, 3:42 AM

LGTM, nice finding for SSE!

Note that we guarantee the use of llvm.fma in some of the vector docs; we probably want to update those as well.

Revision Contents

Path

Size

mlir/

lib/

Conversion/

VectorToLLVM/

ConvertVectorToLLVM.cpp

6 lines

test/

Conversion/

VectorToLLVM/

vector-to-llvm.mlir

10 lines

Diff 277373

mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp

	Show First 20 Lines • Show All 475 Lines • ▼ Show 20 Lines
	/// This does not match vectors of n >= 2 rank.			/// This does not match vectors of n >= 2 rank.
	///			///
	/// Example:			/// Example:
	/// ```			/// ```
	/// vector.fma %a, %a, %a : vector<8xf32>			/// vector.fma %a, %a, %a : vector<8xf32>
	/// ```			/// ```
	/// is converted to:			/// is converted to:
	/// ```			/// ```
	/// llvm.intr.fma %va, %va, %va:			/// llvm.intr.fmuladd %va, %va, %va:
	/// (!llvm<"<8 x float>">, !llvm<"<8 x float>">, !llvm<"<8 x float>">)			/// (!llvm<"<8 x float>">, !llvm<"<8 x float>">, !llvm<"<8 x float>">)
	/// -> !llvm<"<8 x float>">			/// -> !llvm<"<8 x float>">
	/// ```			/// ```
	class VectorFMAOp1DConversion : public ConvertToLLVMPattern {			class VectorFMAOp1DConversion : public ConvertToLLVMPattern {
	public:			public:
	explicit VectorFMAOp1DConversion(MLIRContext *context,			explicit VectorFMAOp1DConversion(MLIRContext *context,
	LLVMTypeConverter &typeConverter)			LLVMTypeConverter &typeConverter)
	: ConvertToLLVMPattern(vector::FMAOp::getOperationName(), context,			: ConvertToLLVMPattern(vector::FMAOp::getOperationName(), context,
	typeConverter) {}			typeConverter) {}

	LogicalResult			LogicalResult
	matchAndRewrite(Operation *op, ArrayRef<Value> operands,			matchAndRewrite(Operation *op, ArrayRef<Value> operands,
	ConversionPatternRewriter &rewriter) const override {			ConversionPatternRewriter &rewriter) const override {
	auto adaptor = vector::FMAOpAdaptor(operands);			auto adaptor = vector::FMAOpAdaptor(operands);
	vector::FMAOp fmaOp = cast<vector::FMAOp>(op);			vector::FMAOp fmaOp = cast<vector::FMAOp>(op);
	VectorType vType = fmaOp.getVectorType();			VectorType vType = fmaOp.getVectorType();
	if (vType.getRank() != 1)			if (vType.getRank() != 1)
	return failure();			return failure();
	rewriter.replaceOpWithNewOp<LLVM::FMAOp>(op, adaptor.lhs(), adaptor.rhs(),			rewriter.replaceOpWithNewOp<LLVM::FMulAddOp>(op, adaptor.lhs(),
	adaptor.acc());			adaptor.rhs(), adaptor.acc());
	return success();			return success();
	}			}
	};			};

	class VectorInsertElementOpConversion : public ConvertToLLVMPattern {			class VectorInsertElementOpConversion : public ConvertToLLVMPattern {
	public:			public:
	explicit VectorInsertElementOpConversion(MLIRContext *context,			explicit VectorInsertElementOpConversion(MLIRContext *context,
	LLVMTypeConverter &typeConverter)			LLVMTypeConverter &typeConverter)
	▲ Show 20 Lines • Show All 707 Lines • Show Last 20 Lines

mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir

	Show First 20 Lines • Show All 230 Lines • ▼ Show 20 Lines
	// CHECK: %[[T0:.*]] = llvm.mlir.constant(dense<0.000000e+00> : vector<2x3xf32>)			// CHECK: %[[T0:.*]] = llvm.mlir.constant(dense<0.000000e+00> : vector<2x3xf32>)
	// CHECK: %[[T1:.*]] = llvm.mlir.constant(0 : i64) : !llvm.i64			// CHECK: %[[T1:.*]] = llvm.mlir.constant(0 : i64) : !llvm.i64
	// CHECK: %[[T2:.*]] = llvm.extractelement %[[A]][%[[T1]] : !llvm.i64] : !llvm<"<2 x float>">			// CHECK: %[[T2:.*]] = llvm.extractelement %[[A]][%[[T1]] : !llvm.i64] : !llvm<"<2 x float>">
	// CHECK: %[[T3:.*]] = llvm.mlir.undef : !llvm<"<3 x float>">			// CHECK: %[[T3:.*]] = llvm.mlir.undef : !llvm<"<3 x float>">
	// CHECK: %[[T4:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32			// CHECK: %[[T4:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
	// CHECK: %[[T5:.*]] = llvm.insertelement %[[T2]], %[[T3]][%[[T4]] : !llvm.i32] : !llvm<"<3 x float>">			// CHECK: %[[T5:.*]] = llvm.insertelement %[[T2]], %[[T3]][%[[T4]] : !llvm.i32] : !llvm<"<3 x float>">
	// CHECK: %[[T6:.*]] = llvm.shufflevector %[[T5]], %[[T3]] [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">			// CHECK: %[[T6:.*]] = llvm.shufflevector %[[T5]], %[[T3]] [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
	// CHECK: %[[T7:.*]] = llvm.extractvalue %[[C]][0] : !llvm<"[2 x <3 x float>]">			// CHECK: %[[T7:.*]] = llvm.extractvalue %[[C]][0] : !llvm<"[2 x <3 x float>]">
	// CHECK: %[[T8:.*]] = "llvm.intr.fma"(%[[T6]], %[[B]], %[[T7]]) : (!llvm<"<3 x float>">, !llvm<"<3 x float>">, !llvm<"<3 x float>">)			// CHECK: %[[T8:.*]] = "llvm.intr.fmuladd"(%[[T6]], %[[B]], %[[T7]]) : (!llvm<"<3 x float>">, !llvm<"<3 x float>">, !llvm<"<3 x float>">)
	// CHECK: %[[T9:.*]] = llvm.insertvalue %[[T8]], %[[T0]][0] : !llvm<"[2 x <3 x float>]">			// CHECK: %[[T9:.*]] = llvm.insertvalue %[[T8]], %[[T0]][0] : !llvm<"[2 x <3 x float>]">
	// CHECK: %[[T10:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64			// CHECK: %[[T10:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64
	// CHECK: %[[T11:.*]] = llvm.extractelement %[[A]][%[[T10]] : !llvm.i64] : !llvm<"<2 x float>">			// CHECK: %[[T11:.*]] = llvm.extractelement %[[A]][%[[T10]] : !llvm.i64] : !llvm<"<2 x float>">
	// CHECK: %[[T12:.*]] = llvm.mlir.undef : !llvm<"<3 x float>">			// CHECK: %[[T12:.*]] = llvm.mlir.undef : !llvm<"<3 x float>">
	// CHECK: %[[T13:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32			// CHECK: %[[T13:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
	// CHECK: %[[T14:.*]] = llvm.insertelement %[[T11]], %[[T12]][%[[T13]] : !llvm.i32] : !llvm<"<3 x float>">			// CHECK: %[[T14:.*]] = llvm.insertelement %[[T11]], %[[T12]][%[[T13]] : !llvm.i32] : !llvm<"<3 x float>">
	// CHECK: %[[T15:.*]] = llvm.shufflevector %[[T14]], %[[T12]] [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">			// CHECK: %[[T15:.*]] = llvm.shufflevector %[[T14]], %[[T12]] [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
	// CHECK: %[[T16:.*]] = llvm.extractvalue %[[C]][1] : !llvm<"[2 x <3 x float>]">			// CHECK: %[[T16:.*]] = llvm.extractvalue %[[C]][1] : !llvm<"[2 x <3 x float>]">
	// CHECK: %[[T17:.*]] = "llvm.intr.fma"(%[[T15]], %[[B]], %[[T16]]) : (!llvm<"<3 x float>">, !llvm<"<3 x float>">, !llvm<"<3 x float>">)			// CHECK: %[[T17:.*]] = "llvm.intr.fmuladd"(%[[T15]], %[[B]], %[[T16]]) : (!llvm<"<3 x float>">, !llvm<"<3 x float>">, !llvm<"<3 x float>">)
	// CHECK: %[[T18:.*]] = llvm.insertvalue %[[T17]], %[[T9]][1] : !llvm<"[2 x <3 x float>]">			// CHECK: %[[T18:.*]] = llvm.insertvalue %[[T17]], %[[T9]][1] : !llvm<"[2 x <3 x float>]">
	// CHECK: llvm.return %[[T18]] : !llvm<"[2 x <3 x float>]">			// CHECK: llvm.return %[[T18]] : !llvm<"[2 x <3 x float>]">

	func @shuffle_1D_direct(%arg0: vector<2xf32>, %arg1: vector<2xf32>) -> vector<2xf32> {			func @shuffle_1D_direct(%arg0: vector<2xf32>, %arg1: vector<2xf32>) -> vector<2xf32> {
	%1 = vector.shuffle %arg0, %arg1 [0, 1] : vector<2xf32>, vector<2xf32>			%1 = vector.shuffle %arg0, %arg1 [0, 1] : vector<2xf32>, vector<2xf32>
	return %1 : vector<2xf32>			return %1 : vector<2xf32>
	}			}
	// CHECK-LABEL: llvm.func @shuffle_1D_direct(			// CHECK-LABEL: llvm.func @shuffle_1D_direct(
	▲ Show 20 Lines • Show All 426 Lines • ▼ Show 20 Lines
	// CHECK: %[[s7:.*]] = llvm.insertelement %[[s5]], %[[s3]][%[[s6]] : !llvm.i64] : !llvm<"<1 x float>">			// CHECK: %[[s7:.*]] = llvm.insertelement %[[s5]], %[[s3]][%[[s6]] : !llvm.i64] : !llvm<"<1 x float>">
	// CHECK: %[[s8:.*]] = llvm.insertvalue %[[s7]], %[[s0]][0] : !llvm<"[1 x <1 x float>]">			// CHECK: %[[s8:.*]] = llvm.insertvalue %[[s7]], %[[s0]][0] : !llvm<"[1 x <1 x float>]">
	// CHECK: llvm.return %[[s8]] : !llvm<"[1 x <1 x float>]">			// CHECK: llvm.return %[[s8]] : !llvm<"[1 x <1 x float>]">

	// CHECK-LABEL: llvm.func @vector_fma(			// CHECK-LABEL: llvm.func @vector_fma(
	// CHECK-SAME: %[[A:.]]: !llvm<"<8 x float>">, %[[B:.]]: !llvm<"[2 x <4 x float>]">)			// CHECK-SAME: %[[A:.]]: !llvm<"<8 x float>">, %[[B:.]]: !llvm<"[2 x <4 x float>]">)
	// CHECK-SAME: -> !llvm<"{ <8 x float>, [2 x <4 x float>] }"> {			// CHECK-SAME: -> !llvm<"{ <8 x float>, [2 x <4 x float>] }"> {
	func @vector_fma(%a: vector<8xf32>, %b: vector<2x4xf32>) -> (vector<8xf32>, vector<2x4xf32>) {			func @vector_fma(%a: vector<8xf32>, %b: vector<2x4xf32>) -> (vector<8xf32>, vector<2x4xf32>) {
	// CHECK: "llvm.intr.fma"(%[[A]], %[[A]], %[[A]]) :			// CHECK: "llvm.intr.fmuladd"(%[[A]], %[[A]], %[[A]]) :
	// CHECK-SAME: (!llvm<"<8 x float>">, !llvm<"<8 x float>">, !llvm<"<8 x float>">) -> !llvm<"<8 x float>">			// CHECK-SAME: (!llvm<"<8 x float>">, !llvm<"<8 x float>">, !llvm<"<8 x float>">) -> !llvm<"<8 x float>">
	%0 = vector.fma %a, %a, %a : vector<8xf32>			%0 = vector.fma %a, %a, %a : vector<8xf32>

	// CHECK: %[[b00:.*]] = llvm.extractvalue %[[B]][0] : !llvm<"[2 x <4 x float>]">			// CHECK: %[[b00:.*]] = llvm.extractvalue %[[B]][0] : !llvm<"[2 x <4 x float>]">
	// CHECK: %[[b01:.*]] = llvm.extractvalue %[[B]][0] : !llvm<"[2 x <4 x float>]">			// CHECK: %[[b01:.*]] = llvm.extractvalue %[[B]][0] : !llvm<"[2 x <4 x float>]">
	// CHECK: %[[b02:.*]] = llvm.extractvalue %[[B]][0] : !llvm<"[2 x <4 x float>]">			// CHECK: %[[b02:.*]] = llvm.extractvalue %[[B]][0] : !llvm<"[2 x <4 x float>]">
	// CHECK: %[[B0:.*]] = "llvm.intr.fma"(%[[b00]], %[[b01]], %[[b02]]) :			// CHECK: %[[B0:.*]] = "llvm.intr.fmuladd"(%[[b00]], %[[b01]], %[[b02]]) :
	// CHECK-SAME: (!llvm<"<4 x float>">, !llvm<"<4 x float>">, !llvm<"<4 x float>">) -> !llvm<"<4 x float>">			// CHECK-SAME: (!llvm<"<4 x float>">, !llvm<"<4 x float>">, !llvm<"<4 x float>">) -> !llvm<"<4 x float>">
	// CHECK: llvm.insertvalue %[[B0]], {{.*}}[0] : !llvm<"[2 x <4 x float>]">			// CHECK: llvm.insertvalue %[[B0]], {{.*}}[0] : !llvm<"[2 x <4 x float>]">
	// CHECK: %[[b10:.*]] = llvm.extractvalue %[[B]][1] : !llvm<"[2 x <4 x float>]">			// CHECK: %[[b10:.*]] = llvm.extractvalue %[[B]][1] : !llvm<"[2 x <4 x float>]">
	// CHECK: %[[b11:.*]] = llvm.extractvalue %[[B]][1] : !llvm<"[2 x <4 x float>]">			// CHECK: %[[b11:.*]] = llvm.extractvalue %[[B]][1] : !llvm<"[2 x <4 x float>]">
	// CHECK: %[[b12:.*]] = llvm.extractvalue %[[B]][1] : !llvm<"[2 x <4 x float>]">			// CHECK: %[[b12:.*]] = llvm.extractvalue %[[B]][1] : !llvm<"[2 x <4 x float>]">
	// CHECK: %[[B1:.*]] = "llvm.intr.fma"(%[[b10]], %[[b11]], %[[b12]]) :			// CHECK: %[[B1:.*]] = "llvm.intr.fmuladd"(%[[b10]], %[[b11]], %[[b12]]) :
	// CHECK-SAME: (!llvm<"<4 x float>">, !llvm<"<4 x float>">, !llvm<"<4 x float>">) -> !llvm<"<4 x float>">			// CHECK-SAME: (!llvm<"<4 x float>">, !llvm<"<4 x float>">, !llvm<"<4 x float>">) -> !llvm<"<4 x float>">
	// CHECK: llvm.insertvalue %[[B1]], {{.*}}[1] : !llvm<"[2 x <4 x float>]">			// CHECK: llvm.insertvalue %[[B1]], {{.*}}[1] : !llvm<"[2 x <4 x float>]">
	%1 = vector.fma %b, %b, %b : vector<2x4xf32>			%1 = vector.fma %b, %b, %b : vector<2x4xf32>

	return %0, %1: vector<8xf32>, vector<2x4xf32>			return %0, %1: vector<8xf32>, vector<2x4xf32>
	}			}

	func @reduce_f32(%arg0: vector<16xf32>) -> f32 {			func @reduce_f32(%arg0: vector<16xf32>) -> f32 {
	▲ Show 20 Lines • Show All 259 Lines • Show Last 20 Lines