This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/PowerPC/
-
Target/
-
PowerPC/
-
PPCInstrAltivec.td
-
test/CodeGen/PowerPC/
-
CodeGen/
-
PowerPC/
-
optimize-vector.ll

Differential D154447

[PowerPC] Improve code gen for vector add
ClosedPublic

Authored by lei on Jul 4 2023, 7:51 AM.

Download Raw Diff

Details

Reviewers

nemanjai
stefanp
amyk
power-llvm-team

Commits

rG329b8cd3e382: [PowerPC] Improve code gen for vector add

Summary

Improve codegen for vector modulo additions.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

lei created this revision. · Jul 4 2023, 7:51 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jul 4 2023, 7:51 AM

Herald added subscribers: shchenz, hiraditya. · View Herald Transcript

lei requested review of this revision. · Jul 4 2023, 7:51 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jul 4 2023, 7:51 AM

Patch by Nemanja

Harbormaster completed remote builds in B243038: Diff 537104. · Jul 4 2023, 8:57 AM

I think this patch is fine, but there may be something missing in terms of vaddudm. Currently, a test case like this one:

define dso_local <2 x i64> @x2d(<2 x i64> noundef %x) {
entry:
  %add = shl <2 x i64> %x, <i64 1, i64 1>
  ret <2 x i64> %add
}

Produces some fairly inefficient code:

x2d:                                    # @x2d
.Lfunc_begin3:
        .cfi_startproc
.Lfunc_gep3:
        addis 2, 12, .TOC.-.Lfunc_gep3@ha
        addi 2, 2, .TOC.-.Lfunc_gep3@l
.Lfunc_lep3:
        .localentry     x2d, .Lfunc_lep3-.Lfunc_gep3
# %bb.0:                                # %entry
        addis 3, 2, .LCPI3_0@toc@ha
        addi 3, 3, .LCPI3_0@toc@l
        lxv 35, 0(3)
        vsld 2, 2, 3
        blr

We even do a TOC access.

Unfortunately, this isn't just a case of adding:

def : Pat<(v2i64 (shl v2i64:$vA, (v2i64 (immEQOneV)))),
          (v2i64 (VADDUDM $vA, $vA))>;

like the others but I think it may be worth doing.

At this point maybe just add the test case and we can deal with the issue at a later date.

In D154447#4472054, @stefanp wrote:
Unfortunately, this isn't just a case of adding:
def : Pat<(v2i64 (shl v2i64:$vA, (v2i64 (immEQOneV)))),
          (v2i64 (VADDUDM $vA, $vA))>;
like the others but I think it may be worth doing.

Yeah, this is because we don't have a way of materializing the <1, 1> vector so we end up with a constant pool load. We can provide custom legalization:
setOperationAction(ISD::SHL, MVT::v2i64, Custom); for Power8 and up where we would just leave the node alone if it's a shift by 1.

At this point maybe just add the test case and we can deal with the issue at a later date.

I agree this can be done in a follow-up patch.

Add a new test case and update the test to target pwr8, since we are more interested
in the new test case's behaviour on pwr8 and the original test cases behave the same on pwr8 and pwr7.

Harbormaster completed remote builds in B244857: Diff 539655. · Jul 12 2023, 2:57 PM

LGTM.

This revision is now accepted and ready to land. · Jul 13 2023, 8:08 AM

Closed by commit rG329b8cd3e382: [PowerPC] Improve code gen for vector add (authored by nemanjai, committed by lei). · Explain Why · Jul 13 2023, 12:22 PM

This revision was automatically updated to reflect the committed changes.

lei added a commit: rG329b8cd3e382: [PowerPC] Improve code gen for vector add.

Revision Contents

Path

Size

llvm/

lib/

Target/

PowerPC/

PPCInstrAltivec.td

7 lines

test/

CodeGen/

PowerPC/

optimize-vector.ll

24 lines

Diff 540156

llvm/lib/Target/PowerPC/PPCInstrAltivec.td

Show First 20 Lines • Show All 1,155 Lines • ▼ Show 20 Lines	def : Pat<(v16i8 (sra (sub v16i8:$vA, (v16i8 (bitconvert(vnot v4i32:$vB)))),
(v16i8 (immEQOneV)))), (v16i8 (VAVGSB $vA, $vB))>;		(v16i8 (immEQOneV)))), (v16i8 (VAVGSB $vA, $vB))>;
def : Pat<(v4i32 (srl (sub v4i32:$vA, (vnot v4i32:$vB)),		def : Pat<(v4i32 (srl (sub v4i32:$vA, (vnot v4i32:$vB)),
(v4i32 (immEQOneV)))), (v4i32 (VAVGUW $vA, $vB))>;		(v4i32 (immEQOneV)))), (v4i32 (VAVGUW $vA, $vB))>;
def : Pat<(v8i16 (srl (sub v8i16:$vA, (v8i16 (bitconvert(vnot v4i32:$vB)))),		def : Pat<(v8i16 (srl (sub v8i16:$vA, (v8i16 (bitconvert(vnot v4i32:$vB)))),
(v8i16 (immEQOneV)))), (v8i16 (VAVGUH $vA, $vB))>;		(v8i16 (immEQOneV)))), (v8i16 (VAVGUH $vA, $vB))>;
def : Pat<(v16i8 (srl (sub v16i8:$vA, (v16i8 (bitconvert(vnot v4i32:$vB)))),		def : Pat<(v16i8 (srl (sub v16i8:$vA, (v16i8 (bitconvert(vnot v4i32:$vB)))),
(v16i8 (immEQOneV)))), (v16i8 (VAVGUB $vA, $vB))>;		(v16i8 (immEQOneV)))), (v16i8 (VAVGUB $vA, $vB))>;

		def : Pat<(v16i8 (shl v16i8:$vA, (v16i8 (immEQOneV)))),
		(v16i8 (VADDUBM $vA, $vA))>;
		def : Pat<(v8i16 (shl v8i16:$vA, (v8i16 (immEQOneV)))),
		(v8i16 (VADDUHM $vA, $vA))>;
		def : Pat<(v4i32 (shl v4i32:$vA, (v4i32 (immEQOneV)))),
		(v4i32 (VADDUWM $vA, $vA))>;

} // end HasAltivec		} // end HasAltivec

// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set.		// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set.
class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>		class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>
: VX_RD5_RSp5_PS1_XO9<xo,		: VX_RD5_RSp5_PS1_XO9<xo,
(outs vrrc:$VD), (ins vrrc:$VA, vrrc:$VB, u1imm:$PS),		(outs vrrc:$VD), (ins vrrc:$VA, vrrc:$VB, u1imm:$PS),
!strconcat(opc, " $VD, $VA, $VB, $PS"), IIC_VecFP, pattern> {		!strconcat(opc, " $VD, $VA, $VB, $PS"), IIC_VecFP, pattern> {
let Defs = [CR6];		let Defs = [CR6];
▲ Show 20 Lines • Show All 475 Lines • Show Last 20 Lines

llvm/test/CodeGen/PowerPC/optimize-vector.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
	; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \			; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
	; RUN: -mcpu=pwr7 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s \| \			; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s \| \
	; RUN: FileCheck %s			; RUN: FileCheck %s

	define dso_local <16 x i8> @x2(<16 x i8> noundef %x) {			define dso_local <16 x i8> @x2(<16 x i8> noundef %x) {
	; CHECK-LABEL: x2:			; CHECK-LABEL: x2:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vspltisb v3, 1			; CHECK-NEXT: vaddubm v2, v2, v2
	; CHECK-NEXT: vslb v2, v2, v3
	; CHECK-NEXT: blr			; CHECK-NEXT: blr
	entry:			entry:
	%add = shl <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>			%add = shl <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
	ret <16 x i8> %add			ret <16 x i8> %add
	}			}

	define dso_local <8 x i16> @x2h(<8 x i16> noundef %x) {			define dso_local <8 x i16> @x2h(<8 x i16> noundef %x) {
	; CHECK-LABEL: x2h:			; CHECK-LABEL: x2h:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vspltish v3, 1			; CHECK-NEXT: vadduhm v2, v2, v2
	; CHECK-NEXT: vslh v2, v2, v3
	; CHECK-NEXT: blr			; CHECK-NEXT: blr
	entry:			entry:
	%add = shl <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>			%add = shl <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
	ret <8 x i16> %add			ret <8 x i16> %add
	}			}

	define dso_local <4 x i32> @x2w(<4 x i32> noundef %x) {			define dso_local <4 x i32> @x2w(<4 x i32> noundef %x) {
	; CHECK-LABEL: x2w:			; CHECK-LABEL: x2w:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vspltisw v3, 1			; CHECK-NEXT: vadduwm v2, v2, v2
	; CHECK-NEXT: vslw v2, v2, v3
	; CHECK-NEXT: blr			; CHECK-NEXT: blr
	entry:			entry:
	%add = shl <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>			%add = shl <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
	ret <4 x i32> %add			ret <4 x i32> %add
	}			}

				define dso_local <2 x i64> @x2d(<2 x i64> noundef %x) {
				; CHECK-LABEL: x2d:
				; CHECK: # %bb.0: # %entry
				; CHECK-NEXT: addis r3, r2, .LCPI3_0@toc@ha
				; CHECK-NEXT: addi r3, r3, .LCPI3_0@toc@l
				; CHECK-NEXT: lxvd2x v3, 0, r3
				; CHECK-NEXT: vsld v2, v2, v3
				; CHECK-NEXT: blr
				entry:
				%add = shl <2 x i64> %x, <i64 1, i64 1>
				ret <2 x i64> %add
				}