This is an archive of the discontinued LLVM Phabricator instance.

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
1301	Shouldn't you be searching for any VCMP opcode? RDA would be a nicer way of finding the VPR def, but that shouldn't be necessary anyway - I'm pretty certain the VCMP should be the 'Divergent' instruction.
llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
2	Probably best not to run at -O3, just in case upstream/downstream have different optimisation pipelines.

Remove -O3 from test and improve VCMP detection.

Clean up the formatting a little.

samtebbs marked 2 inline comments as done. Sep 15 2020, 6:36 AM

samtebbs added inline comments.

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
1301	Checking the Divergent is much better, thanks. I also realised that I wasn't decrementing `I` if the `++I == E` check failed so this new logic is more robust.

samtebbs marked an inline comment as done. Sep 15 2020, 9:00 AM

LGTM, thanks.

This revision is now accepted and ready to land. Sep 15 2020, 11:51 PM

Closed by rGef0b9f3307a1

Tests updated in rGac2717bfdd0d

dmgreen added inline comments. Sep 16 2020, 10:13 AM

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
1326	The VPT block pass has some very similar code. Do we need to check that Operand 1 and Operand 2 have not been modified between the VCMP and where we are materializing the VPT to?

samtebbs marked an inline comment as done. Sep 21 2020, 7:27 AM

samtebbs added inline comments.

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
1326	That is a good spot. I'll submit a check for this in a follow-up.

samtebbs mentioned this in D88022: [ARM][LowOverheadLoops] Check VCMP operands have same def as the VPT before combining. Sep 21 2020, 7:28 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

ARM/

ARMLowOverheadLoops.cpp

39 lines

test/

CodeGen/

Thumb2/

LowOverheadLoops/

vcmp-vpst-combination.ll

49 lines

Diff 291890

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Show First 20 Lines • Show All 1,292 Lines • ▼ Show 20 Lines	if (Block.HasNonUniformPredicate()) {
// - Insert a new vpst to predicate the instruction(s) that following		// - Insert a new vpst to predicate the instruction(s) that following
// the divergent vpr def.		// the divergent vpr def.
// TODO: We could be producing more VPT blocks than necessary and could		// TODO: We could be producing more VPT blocks than necessary and could
// fold the newly created one into a proceeding one.		// fold the newly created one into a proceeding one.
for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()),		for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()),
E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)		E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
RemovePredicate(&*I);		RemovePredicate(&*I);

		// Check if the instruction defining vpr is a vcmp so it can be combined
		samparkerUnsubmitted Done Reply Inline Actions Shouldn't you be searching for any VCMP opcode? RDA would be a nicer way of finding the VPR def, but that shouldn't be necessary anyway - I'm pretty certain the VCMP should be the 'Divergent' instruction. samparker: Shouldn't you be searching for any VCMP opcode? RDA would be a nicer way of finding the VPR def…
		samtebbsAuthorUnsubmitted Done Reply Inline Actions Checking the Divergent is much better, thanks. I also realised that I wasn't decrementing `I` if the `++I == E` check failed so this new logic is more robust. samtebbs: Checking the Divergent is much better, thanks. I also realised that I wasn't decrementing `I`…
		// with the VPST This should be the divergent instruction
		MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->MI->getOpcode()) != 0
		? Divergent->MI
		: nullptr;

unsigned Size = 0;		unsigned Size = 0;
auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);		auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);		auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
MachineInstr *InsertAt = nullptr;		MachineInstr *InsertAt = nullptr;
while (I != E) {		while (I != E) {
InsertAt = &*I;		InsertAt = &*I;
++Size;		++Size;
++I;		++I;
}		}
// Create a VPST (with a null mask for now, we'll recompute it later).		MachineInstrBuilder MIB;
MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,		LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: "
InsertAt->getDebugLoc(),		<< *Block.getPredicateThen());
TII->get(ARM::MVE_VPST));		if (VCMP) {
		// Combine the VPST and VCMP into a VPT
		MIB =
		BuildMI(*InsertAt->getParent(), InsertAt, InsertAt->getDebugLoc(),
		TII->get(VCMPOpcodeToVPT(VCMP->getOpcode())));
		MIB.addImm(ARMVCC::Then);
		// Register one
		MIB.add(VCMP->getOperand(1));
		dmgreenUnsubmitted Not Done Reply Inline Actions The VPT block pass has some very similar code. Do we need to check that Operand 1 and Operand 2 have not been modified between the VCMP and where we are materializing the VPT to? dmgreen: The VPT block pass has some very similar code. Do we need to check that Operand 1 and Operand 2…
		samtebbsAuthorUnsubmitted Done Reply Inline Actions That is a good spot. I'll submit a check for this in a follow-up. samtebbs: That is a good spot. I'll submit a check for this in a follow-up.
		// Register two
		MIB.add(VCMP->getOperand(2));
		// The comparison code, e.g. ge, eq, lt
		MIB.add(VCMP->getOperand(3));
		LLVM_DEBUG(dbgs()
		<< "ARM Loops: Combining with VCMP to VPT: " << *MIB);
		LoLoop.ToRemove.insert(VCMP);
		} else {
		// Create a VPST (with a null mask for now, we'll recompute it later)
		// or a VPT in case there was a VCMP right before it
		MIB = BuildMI(*InsertAt->getParent(), InsertAt,
		InsertAt->getDebugLoc(), TII->get(ARM::MVE_VPST));
MIB.addImm(0);		MIB.addImm(0);
LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);		LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
		}
LoLoop.ToRemove.insert(Block.getPredicateThen());		LoLoop.ToRemove.insert(Block.getPredicateThen());
LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());		LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
}		}
// Else, if the block uses a vpt, iterate over the block, removing the		// Else, if the block uses a vpt, iterate over the block, removing the
// extra VCTPs it may contain.		// extra VCTPs it may contain.
else if (Block.isVPT()) {		else if (Block.isVPT()) {
bool RemovedVCTP = false;		bool RemovedVCTP = false;
for (PredicatedMI &Elt : Block.getInsts()) {		for (PredicatedMI &Elt : Block.getInsts()) {
▲ Show 20 Lines • Show All 158 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll

This file was added.

				; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=force-enabled-no-reductions -o - %s \| FileCheck %s

				samparkerUnsubmitted Done Reply Inline Actions Probably best not to run at -O3, just in case upstream/downstream have different optimisation pipelines. samparker: Probably best not to run at -O3, just in case upstream/downstream have different optimisation…
				define arm_aapcs_vfpcc <16 x i8> @vcmp_vpst_combination(<16 x i8>* %pSrc, i16 zeroext %blockSize, i8* nocapture %pResult, i32* nocapture %pIndex) {
				; CHECK-LABEL: vcmp_vpst_combination:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: .save {r7, lr}
				; CHECK-NEXT: push {r7, lr}
				; CHECK-NEXT: vmov.i8 q0, #0x7f
				; CHECK-NEXT: dlstp.8 lr, r1
				; CHECK-NEXT: .LBB0_1: @ %do.body
				; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				; CHECK-NEXT: vldrb.u8 q1, [r0]
				; CHECK-NEXT: vpt.s8 ge, q0, q1
				; CHECK-NEXT: vmovt q0, q1
				; CHECK-NEXT: letp lr, .LBB0_1
				; CHECK-NEXT: @ %bb.2: @ %do.end
				; CHECK-NEXT: pop {r7, pc}
				entry:
				%conv = zext i16 %blockSize to i32
				%0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 0, i32 1)
				%1 = extractvalue { <16 x i8>, i32 } %0, 0
				br label %do.body

				do.body: ; preds = %do.body, %entry
				%indexVec.0 = phi <16 x i8> [ %1, %entry ], [ %add, %do.body ]
				%curExtremIdxVec.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %6, %do.body ]
				%curExtremValVec.0 = phi <16 x i8> [ <i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127>, %entry ], [ %6, %do.body ]
				%blkCnt.0 = phi i32 [ %conv, %entry ], [ %sub2, %do.body ]
				%2 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %blkCnt.0)
				%3 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %pSrc, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
				%4 = icmp sle <16 x i8> %3, %curExtremValVec.0
				%5 = and <16 x i1> %4, %2
				%6 = tail call <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8> %3, <16 x i8> %3, <16 x i1> %5, <16 x i8> %curExtremValVec.0)
				%add = add <16 x i8> %indexVec.0, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
				%sub2 = add nsw i32 %blkCnt.0, -16
				%cmp = icmp sgt i32 %blkCnt.0, 16
				br i1 %cmp, label %do.body, label %do.end

				do.end: ; preds = %do.body
				ret <16 x i8> %6
				}

				declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)

				declare <16 x i1> @llvm.arm.mve.vctp8(i32)

				declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)

				declare <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>)

This is an archive of the discontinued LLVM Phabricator instance.

[ARM][LowOverheadLoops] Combine a VCMP and VPST into a VPT (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 291890

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll

[ARM][LowOverheadLoops] Combine a VCMP and VPST into a VPT
Closed, Public