This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/PowerPC/
-
Target/
-
PowerPC/
-
PPCISelLowering.cpp
1/2
PPCInstrAltivec.td
-
test/CodeGen/PowerPC/
-
CodeGen/
-
PowerPC/
-
vmladduhm.ll

Differential D76751

[PowerPC] Improve the way legalize mul for v8i16 and add pattern to match mul + add
ClosedPublic

Authored by steven.zhang on Mar 24 2020, 8:44 PM.

Download Raw Diff

Details

Reviewers

nemanjai
jsji

Group Reviewers

Restricted Project

Commits

rG1ef7bf412141: [PowerPC] Improve the way legalize mul for v8i16 and add pattern to match mul +…

Summary

We can legalize the MUL operation for v8i16 with the instruction (vmladduhm A, B, 0) if Altivec is enabled. Currently it is marked as Custom and expanded later, which is not the right way. Once MUL is legal, we can also add a pattern to match mul + add with (vmladduhm A, B, C).

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

steven.zhang created this revision.Mar 24 2020, 8:44 PM

Herald added a project: Restricted Project. · View Herald TranscriptMar 24 2020, 8:44 PM

Herald added subscribers: shchenz, • wuzish, kbarton, hiraditya. · View Herald Transcript

Harbormaster completed remote builds in B50352: Diff 252486.Mar 24 2020, 9:52 PM

Thanks for putting this up. This is actually something I meant to do very soon as we need it for a specific benchmark. Please fix the pattern and then this patch is fine.

Demonstration that the pattern is backwards:

$ cat def.c 
vector unsigned short test(vector unsigned short A, vector unsigned short B,
                           vector unsigned short C) {
  return A + B * C;
}

$ cat use.c 
#include <stdio.h>
#include <stdlib.h>
vector unsigned short test(vector unsigned short, vector unsigned short,
                           vector unsigned short);
int main(int argc, const char **argv) {
  unsigned short A = atoi(argv[1]);
  unsigned short B = atoi(argv[2]);
  unsigned short C = atoi(argv[3]);

  vector unsigned short Res =
      test((vector unsigned short)A, (vector unsigned short)B,
           (vector unsigned short)C);
  printf("Res: { %hu, %hu, %hu, %hu, %hu, %hu, %hu, %hu }\n", Res[0], Res[1],
         Res[2], Res[3], Res[4], Res[5], Res[6], Res[7]);
  return 0;
}

$ ./a.out 4 3 2
Res: { 11, 11, 11, 11, 11, 11, 11, 11 }

# correct value:
$ ./correct 4 3 2
Res: { 10, 10, 10, 10, 10, 10, 10, 10 }

llvm/lib/Target/PowerPC/PPCInstrAltivec.td
876	Huh? Is this right? Doesn't the instruction do `$vA * $vB + $vC`? So wouldn't that mean that we need the input pattern to be `(add v8i16:$vC, (mul v8i16:$vA, v8i16:$vB))`? Or the output to be `(VMLADDUHM $vB, $vC, $vA)` if we are leaving the input the same.

This revision now requires changes to proceed.Mar 25 2020, 3:23 AM

steven.zhang marked an inline comment as done.Mar 25 2020, 4:02 AM

steven.zhang added inline comments.

llvm/lib/Target/PowerPC/PPCInstrAltivec.td
876	Oops! You are right.

Fix the pattern issue.

Harbormaster failed remote builds in B50377: Diff 252536!Mar 25 2020, 5:22 AM

LGTM. Thanks.

This revision is now accepted and ready to land.Mar 25 2020, 10:37 AM

Closed by commit rG1ef7bf412141: [PowerPC] Improve the way legalize mul for v8i16 and add pattern to match mul +… (authored by steven.zhang). · Explain WhyMar 25 2020, 10:12 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

PowerPC/

PPCISelLowering.cpp

9 lines

PPCInstrAltivec.td

6 lines

test/

CodeGen/

PowerPC/

vmladduhm.ll

6 lines

Diff 252749

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 758 Lines • ▼ Show 20 Lines	if (TM.Options.UnsafeFPMath \|\| Subtarget.hasVSX()) {
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);		setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
}		}

if (Subtarget.hasP8Altivec())		if (Subtarget.hasP8Altivec())
setOperationAction(ISD::MUL, MVT::v4i32, Legal);		setOperationAction(ISD::MUL, MVT::v4i32, Legal);
else		else
setOperationAction(ISD::MUL, MVT::v4i32, Custom);		setOperationAction(ISD::MUL, MVT::v4i32, Custom);

setOperationAction(ISD::MUL, MVT::v8i16, Custom);		setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);		setOperationAction(ISD::MUL, MVT::v16i8, Custom);

setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);		setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);		setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);

setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);		setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);		setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);		setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
▲ Show 20 Lines • Show All 9,673 Lines • ▼ Show 20 Lines	SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
LHS, RHS, DAG, dl, MVT::v4i32);		LHS, RHS, DAG, dl, MVT::v4i32);

SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,		SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);		LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
// Shift the high parts up 16 bits.		// Shift the high parts up 16 bits.
HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,		HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
Neg16, DAG, dl);		Neg16, DAG, dl);
return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);		return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
} else if (Op.getValueType() == MVT::v8i16) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);

SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);

return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
LHS, RHS, Zero, DAG, dl);
} else if (Op.getValueType() == MVT::v16i8) {		} else if (Op.getValueType() == MVT::v16i8) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);		SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
bool isLittleEndian = Subtarget.isLittleEndian();		bool isLittleEndian = Subtarget.isLittleEndian();

// Multiply the even 8-bit parts, producing 16-bit sums.		// Multiply the even 8-bit parts, producing 16-bit sums.
SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,		SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
LHS, RHS, DAG, dl, MVT::v8i16);		LHS, RHS, DAG, dl, MVT::v8i16);
EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);		EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
▲ Show 20 Lines • Show All 5,628 Lines • Show Last 20 Lines

llvm/lib/Target/PowerPC/PPCInstrAltivec.td

	Show First 20 Lines • Show All 863 Lines • ▼ Show 20 Lines
	// Rotates.			// Rotates.
	def : Pat<(v16i8 (rotl v16i8:$vA, v16i8:$vB)),			def : Pat<(v16i8 (rotl v16i8:$vA, v16i8:$vB)),
	(v16i8 (VRLB v16i8:$vA, v16i8:$vB))>;			(v16i8 (VRLB v16i8:$vA, v16i8:$vB))>;
	def : Pat<(v8i16 (rotl v8i16:$vA, v8i16:$vB)),			def : Pat<(v8i16 (rotl v8i16:$vA, v8i16:$vB)),
	(v8i16 (VRLH v8i16:$vA, v8i16:$vB))>;			(v8i16 (VRLH v8i16:$vA, v8i16:$vB))>;
	def : Pat<(v4i32 (rotl v4i32:$vA, v4i32:$vB)),			def : Pat<(v4i32 (rotl v4i32:$vA, v4i32:$vB)),
	(v4i32 (VRLW v4i32:$vA, v4i32:$vB))>;			(v4i32 (VRLW v4i32:$vA, v4i32:$vB))>;

				// Multiply
				def : Pat<(mul v8i16:$vA, v8i16:$vB), (VMLADDUHM $vA, $vB, (v8i16(V_SET0H)))>;

				// Add
				def : Pat<(add (mul v8i16:$vA, v8i16:$vB), v8i16:$vC), (VMLADDUHM $vA, $vB, $vC)>;
				nemanjaiUnsubmitted Not Done Reply Inline Actions Huh? Is this right? Doesn't the instruction do `$vA * $vB + $vC`? So wouldn't that mean that we need the input pattern to be `(add v8i16:$vC, (mul v8i16:$vA, v8i16:$vB))`? Or the output to be `(VMLADDUHM $vB, $vC, $vA)` if we are leaving the input the same. nemanjai: Huh? Is this right? Doesn't the instruction do `$vA * $vB + $vC`? So wouldn't that mean that we…
				steven.zhangAuthorUnsubmitted Done Reply Inline Actions Oops! You are right. steven.zhang: Oops! You are right.

	// Saturating adds/subtracts.			// Saturating adds/subtracts.
	def : Pat<(v16i8 (saddsat v16i8:$vA, v16i8:$vB)), (v16i8 (VADDSBS $vA, $vB))>;			def : Pat<(v16i8 (saddsat v16i8:$vA, v16i8:$vB)), (v16i8 (VADDSBS $vA, $vB))>;
	def : Pat<(v16i8 (uaddsat v16i8:$vA, v16i8:$vB)), (v16i8 (VADDUBS $vA, $vB))>;			def : Pat<(v16i8 (uaddsat v16i8:$vA, v16i8:$vB)), (v16i8 (VADDUBS $vA, $vB))>;
	def : Pat<(v8i16 (saddsat v8i16:$vA, v8i16:$vB)), (v8i16 (VADDSHS $vA, $vB))>;			def : Pat<(v8i16 (saddsat v8i16:$vA, v8i16:$vB)), (v8i16 (VADDSHS $vA, $vB))>;
	def : Pat<(v8i16 (uaddsat v8i16:$vA, v8i16:$vB)), (v8i16 (VADDUHS $vA, $vB))>;			def : Pat<(v8i16 (uaddsat v8i16:$vA, v8i16:$vB)), (v8i16 (VADDUHS $vA, $vB))>;
	def : Pat<(v4i32 (saddsat v4i32:$vA, v4i32:$vB)), (v4i32 (VADDSWS $vA, $vB))>;			def : Pat<(v4i32 (saddsat v4i32:$vA, v4i32:$vB)), (v4i32 (VADDSWS $vA, $vB))>;
	def : Pat<(v4i32 (uaddsat v4i32:$vA, v4i32:$vB)), (v4i32 (VADDUWS $vA, $vB))>;			def : Pat<(v4i32 (uaddsat v4i32:$vA, v4i32:$vB)), (v4i32 (VADDUWS $vA, $vB))>;
	def : Pat<(v16i8 (ssubsat v16i8:$vA, v16i8:$vB)), (v16i8 (VSUBSBS $vA, $vB))>;			def : Pat<(v16i8 (ssubsat v16i8:$vA, v16i8:$vB)), (v16i8 (VSUBSBS $vA, $vB))>;
	▲ Show 20 Lines • Show All 722 Lines • Show Last 20 Lines

llvm/test/CodeGen/PowerPC/vmladduhm.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s \| FileCheck %s --check-prefixes=CHECK,CHECK-P9			; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s \| FileCheck %s --check-prefixes=CHECK,CHECK-P9
	; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s \| FileCheck %s --check-prefixes=CHECK,CHECK-P8			; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s \| FileCheck %s --check-prefixes=CHECK,CHECK-P8
	define <8 x i16> @mul(<8 x i16> %m, <8 x i16> %n) {			define <8 x i16> @mul(<8 x i16> %m, <8 x i16> %n) {
	; CHECK-LABEL: mul:			; CHECK-LABEL: mul:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: xxlxor 36, 36, 36			; CHECK-NEXT: vxor 4, 4, 4
	; CHECK-NEXT: vmladduhm 2, 2, 3, 4			; CHECK-NEXT: vmladduhm 2, 2, 3, 4
	; CHECK-NEXT: blr			; CHECK-NEXT: blr
	entry:			entry:
	%0 = mul <8 x i16> %m, %n			%0 = mul <8 x i16> %m, %n
	ret <8 x i16> %0			ret <8 x i16> %0
	}			}

	define <8 x i16> @madd(<8 x i16> %m, <8 x i16> %n, <8 x i16> %o) {			define <8 x i16> @madd(<8 x i16> %m, <8 x i16> %n, <8 x i16> %o) {
	; CHECK-LABEL: madd:			; CHECK-LABEL: madd:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: xxlxor 37, 37, 37			; CHECK-NEXT: vmladduhm 2, 2, 3, 4
	; CHECK-NEXT: vmladduhm 2, 2, 3, 5
	; CHECK-NEXT: vadduhm 2, 2, 4
	; CHECK-NEXT: blr			; CHECK-NEXT: blr
	entry:			entry:
	%0 = mul <8 x i16> %m, %n			%0 = mul <8 x i16> %m, %n
	%1 = add <8 x i16> %0, %o			%1 = add <8 x i16> %0, %o
	ret <8 x i16> %1			ret <8 x i16> %1
	}			}