This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX512DQ] Improve lowering of MUL v2i64 and v4i64
Closed, Public

Authored by RKSimon on Oct 26 2016, 1:55 PM.

Download Raw Diff

Details

Reviewers

delena
igorb

Commits

rG820e1326d726: [X86][AVX512DQ] Improve lowering of MUL v2i64 and v4i64
rL285304: [X86][AVX512DQ] Improve lowering of MUL v2i64 and v4i64

Summary

With DQI but without VLX, lower v2i64 and v4i64 MUL operations with v8i64 MUL (vpmullq).

Updated cost table accordingly.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 75943. Oct 26 2016, 1:55 PM

RKSimon retitled this revision to [X86][AVX512DQ] Improve lowering of MUL v2i64 and v4i64.

RKSimon updated this object.

RKSimon added reviewers: delena, igorb.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a subscriber: llvm-commits.

LGTM

lib/Target/X86/X86ISelLowering.cpp
19858	Hello. It is possible to implement this logic in the .td file, similar to the multiclass avx512_var_shift_w_lowering<..> implementation.

This revision is now accepted and ready to land. Oct 27 2016, 7:01 AM

Closed by commit rL285304: [X86][AVX512DQ] Improve lowering of MUL v2i64 and v4i64 (authored by RKSimon). · Explain Why · Oct 27 2016, 8:36 AM

This revision was automatically updated to reflect the committed changes.

RKSimon mentioned this in rL285313: [X86][AVX512DQ] Move v2i64 and v4i64 MUL lowering to tablegen. Oct 27 2016, 10:17 AM

Revision Contents

Path

Size

lib/

Target/

X86/

	X86ISelLowering.cpp
	X86ISelLowering.cpp (revision 285238)

18 lines

	X86TargetTransformInfo.cpp
	X86TargetTransformInfo.cpp (revision 285238)

13 lines

test/

Analysis/

CostModel/

X86/

	arith.ll
	arith.ll (revision 285238)

17 lines

CodeGen/

X86/

	avx512-arith.ll
	avx512-arith.ll (revision 285238)

26 lines

Diff 75943

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 19,848 Lines • ▼ Show 20 Lines	if (VT == MVT::v4i32) {
// shuffles.		// shuffles.
static const int ShufMask[] = { 0, 4, 2, 6 };		static const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);		return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}		}

assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&		assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");		"Only know how to lower V2I64/V4I64/V8I64 multiply");

		// AVX512DQ - extend to 512 bit vector.
		if (Subtarget.hasDQI()) {
		igorb (inline comment; Unsubmitted, Not Done): Hello. It is possible to implement this logic in the .td file, similar to the multiclass avx512_var_shift_w_lowering<..> implementation.
		assert(!Subtarget.hasVLX() && "AVX512DQVL vXi64 multiply is legal");
		assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
		"AVX512DQ v8i64 multiply is legal");

		MVT NewVT = MVT::getVectorVT(MVT::i64, 512 / VT.getScalarSizeInBits());
		SDValue A512 =
		DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getUNDEF(NewVT), A,
		DAG.getIntPtrConstant(0, dl));
		SDValue B512 =
		DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getUNDEF(NewVT), B,
		DAG.getIntPtrConstant(0, dl));
		SDValue MulNode = DAG.getNode(ISD::MUL, dl, NewVT, A512, B512);
		return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MulNode,
		DAG.getIntPtrConstant(0, dl));
		}

// Ahi = psrlqi(a, 32);		// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);		// Bhi = psrlqi(b, 32);
//		//
// AloBlo = pmuludq(a, b);		// AloBlo = pmuludq(a, b);
// AloBhi = pmuludq(a, Bhi);		// AloBhi = pmuludq(a, Bhi);
// AhiBlo = pmuludq(Ahi, b);		// AhiBlo = pmuludq(Ahi, b);

// AloBhi = psllqi(AloBhi, 32);		// AloBhi = psllqi(AloBhi, 32);
▲ Show 20 Lines • Show All 13,097 Lines • Show Last 20 Lines

lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 198 Lines • ▼ Show 20 Lines	if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())		if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;		return LT.first * 15;

if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,		if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
LT.second))		LT.second))
return LT.first * Entry->Cost;		return LT.first * Entry->Cost;
}		}

		static const CostTblEntry AVX512DQCostTable[] = {
		{ ISD::MUL, MVT::v2i64, 1 },
		{ ISD::MUL, MVT::v4i64, 1 },
		{ ISD::MUL, MVT::v8i64, 1 }
		};

		// Look for AVX512DQ lowering tricks for custom cases.
		if (ST->hasDQI()) {
		if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD,
		LT.second))
		return LT.first * Entry->Cost;
		}

static const CostTblEntry AVX512BWCostTable[] = {		static const CostTblEntry AVX512BWCostTable[] = {
// Vectorizing division is a bad idea. See the SSE2 table for more comments.		// Vectorizing division is a bad idea. See the SSE2 table for more comments.
{ ISD::SDIV, MVT::v64i8, 64*20 },		{ ISD::SDIV, MVT::v64i8, 64*20 },
{ ISD::SDIV, MVT::v32i16, 32*20 },		{ ISD::SDIV, MVT::v32i16, 32*20 },
{ ISD::SDIV, MVT::v16i32, 16*20 },		{ ISD::SDIV, MVT::v16i32, 16*20 },
{ ISD::SDIV, MVT::v8i64, 8*20 },		{ ISD::SDIV, MVT::v8i64, 8*20 },
{ ISD::UDIV, MVT::v64i8, 64*20 },		{ ISD::UDIV, MVT::v64i8, 64*20 },
{ ISD::UDIV, MVT::v32i16, 32*20 },		{ ISD::UDIV, MVT::v32i16, 32*20 },
▲ Show 20 Lines • Show All 1,588 Lines • Show Last 20 Lines

test/Analysis/CostModel/X86/arith.ll

	; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSSE3			; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSSE3
	; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42			; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42
	; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX			; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
	; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2			; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
	; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F			; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
	; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW			; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
				; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
	target triple = "x86_64-apple-macosx10.8.0"			target triple = "x86_64-apple-macosx10.8.0"

	; CHECK-LABEL: 'add'			; CHECK-LABEL: 'add'
	define i32 @add(i32 %arg) {			define i32 @add(i32 %arg) {
	; SSSE3: cost of 1 {{.*}} %A = add			; SSSE3: cost of 1 {{.*}} %A = add
	; SSE42: cost of 1 {{.*}} %A = add			; SSE42: cost of 1 {{.*}} %A = add
	▲ Show 20 Lines • Show All 409 Lines • ▼ Show 20 Lines
	}			}

	; CHECK-LABEL: 'mul'			; CHECK-LABEL: 'mul'
	define i32 @mul(i32 %arg) {			define i32 @mul(i32 %arg) {
	; SSSE3: cost of 9 {{.*}} %A = mul			; SSSE3: cost of 9 {{.*}} %A = mul
	; SSE42: cost of 9 {{.*}} %A = mul			; SSE42: cost of 9 {{.*}} %A = mul
	; AVX: cost of 9 {{.*}} %A = mul			; AVX: cost of 9 {{.*}} %A = mul
	; AVX2: cost of 9 {{.*}} %A = mul			; AVX2: cost of 9 {{.*}} %A = mul
	; AVX512: cost of 9 {{.*}} %A = mul			; AVX512F: cost of 9 {{.*}} %A = mul
				; AVX512BW: cost of 9 {{.*}} %A = mul
				; AVX512DQ: cost of 1 {{.*}} %A = mul
	%A = mul <2 x i64> undef, undef			%A = mul <2 x i64> undef, undef
	; SSSE3: cost of 18 {{.*}} %B = mul			; SSSE3: cost of 18 {{.*}} %B = mul
	; SSE42: cost of 18 {{.*}} %B = mul			; SSE42: cost of 18 {{.*}} %B = mul
	; AVX: cost of 18 {{.*}} %B = mul			; AVX: cost of 18 {{.*}} %B = mul
	; AVX2: cost of 9 {{.*}} %B = mul			; AVX2: cost of 9 {{.*}} %B = mul
	; AVX512: cost of 9 {{.*}} %B = mul			; AVX512F: cost of 9 {{.*}} %B = mul
				; AVX512BW: cost of 9 {{.*}} %B = mul
				; AVX512DQ: cost of 1 {{.*}} %B = mul
	%B = mul <4 x i64> undef, undef			%B = mul <4 x i64> undef, undef
	; SSSE3: cost of 36 {{.*}} %C = mul			; SSSE3: cost of 36 {{.*}} %C = mul
	; SSE42: cost of 36 {{.*}} %C = mul			; SSE42: cost of 36 {{.*}} %C = mul
	; AVX: cost of 36 {{.*}} %C = mul			; AVX: cost of 36 {{.*}} %C = mul
	; AVX2: cost of 18 {{.*}} %C = mul			; AVX2: cost of 18 {{.*}} %C = mul
	; AVX512: cost of 2 {{.*}} %C = mul			; AVX512F: cost of 2 {{.*}} %C = mul
				; AVX512BW: cost of 2 {{.*}} %C = mul
				; AVX512DQ: cost of 1 {{.*}} %C = mul
	%C = mul <8 x i64> undef, undef			%C = mul <8 x i64> undef, undef

	; SSSE3: cost of 6 {{.*}} %D = mul			; SSSE3: cost of 6 {{.*}} %D = mul
	; SSE42: cost of 1 {{.*}} %D = mul			; SSE42: cost of 1 {{.*}} %D = mul
	; AVX: cost of 1 {{.*}} %D = mul			; AVX: cost of 1 {{.*}} %D = mul
	; AVX2: cost of 1 {{.*}} %D = mul			; AVX2: cost of 1 {{.*}} %D = mul
	; AVX512: cost of 1 {{.*}} %D = mul			; AVX512: cost of 1 {{.*}} %D = mul
	%D = mul <4 x i32> undef, undef			%D = mul <4 x i32> undef, undef
	▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines
	define void @mul_2i32() {			define void @mul_2i32() {
	; A <2 x i32> gets expanded to a <2 x i64> vector.			; A <2 x i32> gets expanded to a <2 x i64> vector.
	; A <2 x i64> vector multiply is implemented using			; A <2 x i64> vector multiply is implemented using
	; 3 PMULUDQ and 2 PADDS and 4 shifts.			; 3 PMULUDQ and 2 PADDS and 4 shifts.
	; SSSE3: cost of 9 {{.*}} %A0 = mul			; SSSE3: cost of 9 {{.*}} %A0 = mul
	; SSE42: cost of 9 {{.*}} %A0 = mul			; SSE42: cost of 9 {{.*}} %A0 = mul
	; AVX: cost of 9 {{.*}} %A0 = mul			; AVX: cost of 9 {{.*}} %A0 = mul
	; AVX2: cost of 9 {{.*}} %A0 = mul			; AVX2: cost of 9 {{.*}} %A0 = mul
	; AVX512: cost of 9 {{.*}} %A0 = mul			; AVX512F: cost of 9 {{.*}} %A0 = mul
				; AVX512BW: cost of 9 {{.*}} %A0 = mul
				; AVX512DQ: cost of 1 {{.*}} %A0 = mul
	%A0 = mul <2 x i32> undef, undef			%A0 = mul <2 x i32> undef, undef

	ret void			ret void
	}			}

test/CodeGen/X86/avx512-arith.ll

	Show First 20 Lines • Show All 176 Lines • ▼ Show 20 Lines
	; AVX512BW-NEXT: vpmuludq %ymm0, %ymm1, %ymm0			; AVX512BW-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
	; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0			; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0
	; AVX512BW-NEXT: vpaddq %ymm0, %ymm3, %ymm0			; AVX512BW-NEXT: vpaddq %ymm0, %ymm3, %ymm0
	; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0			; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	;			;
	; AVX512DQ-LABEL: imulq256:			; AVX512DQ-LABEL: imulq256:
	; AVX512DQ: ## BB#0:			; AVX512DQ: ## BB#0:
	; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm2			; AVX512DQ-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
	; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm3			; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
	; AVX512DQ-NEXT: vpmuludq %ymm3, %ymm1, %ymm3			; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0
	; AVX512DQ-NEXT: vpsllq $32, %ymm3, %ymm3			; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
	; AVX512DQ-NEXT: vpsrlq $32, %ymm1, %ymm1
	; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
	; AVX512DQ-NEXT: vpsllq $32, %ymm0, %ymm0
	; AVX512DQ-NEXT: vpaddq %ymm0, %ymm3, %ymm0
	; AVX512DQ-NEXT: vpaddq %ymm0, %ymm2, %ymm0
	; AVX512DQ-NEXT: retq			; AVX512DQ-NEXT: retq
	;			;
	; SKX-LABEL: imulq256:			; SKX-LABEL: imulq256:
	; SKX: ## BB#0:			; SKX: ## BB#0:
	; SKX-NEXT: vpmullq %ymm0, %ymm1, %ymm0			; SKX-NEXT: vpmullq %ymm0, %ymm1, %ymm0
	; SKX-NEXT: retq			; SKX-NEXT: retq
	%z = mul <4 x i64>%x, %y			%z = mul <4 x i64>%x, %y
	ret <4 x i64>%z			ret <4 x i64>%z
	Show All 36 Lines
	; AVX512BW-NEXT: vpmuludq %xmm0, %xmm1, %xmm0			; AVX512BW-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
	; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0			; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0
	; AVX512BW-NEXT: vpaddq %xmm0, %xmm3, %xmm0			; AVX512BW-NEXT: vpaddq %xmm0, %xmm3, %xmm0
	; AVX512BW-NEXT: vpaddq %xmm0, %xmm2, %xmm0			; AVX512BW-NEXT: vpaddq %xmm0, %xmm2, %xmm0
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	;			;
	; AVX512DQ-LABEL: imulq128:			; AVX512DQ-LABEL: imulq128:
	; AVX512DQ: ## BB#0:			; AVX512DQ: ## BB#0:
	; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm2			; AVX512DQ-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
	; AVX512DQ-NEXT: vpsrlq $32, %xmm0, %xmm3			; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
	; AVX512DQ-NEXT: vpmuludq %xmm3, %xmm1, %xmm3			; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0
	; AVX512DQ-NEXT: vpsllq $32, %xmm3, %xmm3			; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
	; AVX512DQ-NEXT: vpsrlq $32, %xmm1, %xmm1
	; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
	; AVX512DQ-NEXT: vpsllq $32, %xmm0, %xmm0
	; AVX512DQ-NEXT: vpaddq %xmm0, %xmm3, %xmm0
	; AVX512DQ-NEXT: vpaddq %xmm0, %xmm2, %xmm0
	; AVX512DQ-NEXT: retq			; AVX512DQ-NEXT: retq
	;			;
	; SKX-LABEL: imulq128:			; SKX-LABEL: imulq128:
	; SKX: ## BB#0:			; SKX: ## BB#0:
	; SKX-NEXT: vpmullq %xmm0, %xmm1, %xmm0			; SKX-NEXT: vpmullq %xmm0, %xmm1, %xmm0
	; SKX-NEXT: retq			; SKX-NEXT: retq
	%z = mul <2 x i64>%x, %y			%z = mul <2 x i64>%x, %y
	ret <2 x i64>%z			ret <2 x i64>%z
	▲ Show 20 Lines • Show All 846 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX512DQ] Improve lowering of MUL v2i64 and v4i64 — Closed, Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 75943

lib/Target/X86/X86ISelLowering.cpp

lib/Target/X86/X86TargetTransformInfo.cpp

test/Analysis/CostModel/X86/arith.ll

test/CodeGen/X86/avx512-arith.ll

[X86][AVX512DQ] Improve lowering of MUL v2i64 and v4i64
Closed, Public