This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Backedge indexing
AbandonedPublic

Authored by SjoerdMeijer on Oct 21 2020, 9:19 AM.

Download Raw Diff

Details

Reviewers

dmgreen
fhahn
efriedma
sanwou01
samparker

Summary

Bit of a brain dump, because I was seeing the same problems with addressing modes in unrolled loops. This is directly related to what @SjoerdMeijer is currently working on in D89693, and I doubt I will have time to look into this any further...

For the benchmark that I am looking at, the total size shrinks, but there seems to be a problem because we no longer generate the LDPs (which I presume is just a current limitation of the AArch64LoadStoreOptimizer?):

< 	ldp	q0, q2, [x2, #-16]
< 	ldp	q1, q3, [x4, #-16]
< 	subs	x5, x5, #8                      // =8
< 	add	x4, x4, #32                     // =32
< 	add	x2, x2, #32                     // =32
< 	fmul	v0.4s, v0.4s, v1.4s
< 	fmul	v2.4s, v2.4s, v3.4s
< 	ldp	q1, q3, [x3, #-16]
< 	fadd	v0.4s, v1.4s, v0.4s
< 	fadd	v1.4s, v3.4s, v2.4s
< 	stp	q0, q1, [x3, #-16]
< 	add	x3, x3, #32                     // =32
---
> 	ldr	q0, [x5, #32]!
> 	subs	x27, x27, #8                    // =8
> 	ldur	q1, [x5, #-16]
> 	ldr	q2, [x7, #32]!
> 	ldur	q3, [x7, #-16]
> 	ldr	q4, [x6, #32]!
> 	fmul	v0.4s, v0.4s, v2.4s
> 	fmul	v1.4s, v1.4s, v3.4s
> 	ldr	q2, [x6, #16]
> 	fadd	v1.4s, v4.4s, v1.4s
> 	fadd	v0.4s, v2.4s, v0.4s
> 	stp	q1, q0, [x6]

Diff Detail

Unit Tests: Failed

	Time	Test
	390 ms	linux > HWAddressSanitizer-x86_64.TestCases::sizes.cpp
	30 ms	linux > LLVM.CodeGen/AArch64::falkor-hwpf-fix.ll
	40 ms	linux > LLVM.CodeGen/AArch64::pr27816.ll
	50 ms	linux > LLVM.CodeGen/AArch64::ragreedy-local-interval-cost.ll
	40 ms	linux > LLVM.Transforms/LoopStrengthReduce/AArch64::lsr-memcpy.ll
		View Full Test Results (11 Failed)

Event Timeline

samparker created this revision. Oct 21 2020, 9:19 AM

Herald added a project: Restricted Project. · View Herald Transcript · Oct 21 2020, 9:19 AM

Herald added subscribers: danielkiss, arphaman, hiraditya, kristof.beyls. · View Herald Transcript

samparker requested review of this revision. Oct 21 2020, 9:19 AM

Harbormaster completed remote builds in B75899: Diff 299711. Oct 21 2020, 10:09 AM

SjoerdMeijer mentioned this in D89693: [AArch64] Favor pre-increments and implement TTI::getPreferredAddressingMode. Oct 21 2020, 11:53 AM

I am abandoning this in favour of D89693, which I have repurposed to address this, because most of the discussions happened there.

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64TargetTransformInfo.h

2 lines

AArch64TargetTransformInfo.cpp

31 lines

Diff 299711

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Show First 20 Lines • Show All 225 Lines • ▼ Show 20 Lines	case Intrinsic::experimental_vector_reduce_fmin:
return !II->getFastMathFlags().noNaNs();		return !II->getFastMathFlags().noNaNs();

default:		default:
// Don't expand anything else, let legalization deal with it.		// Don't expand anything else, let legalization deal with it.
return false;		return false;
}		}
}		}

		bool shouldFavorBackedgeIndex(const Loop *L) const;

unsigned getGISelRematGlobalCost() const {		unsigned getGISelRematGlobalCost() const {
return 2;		return 2;
}		}

bool useReductionIntrinsic(unsigned Opcode, Type *Ty,		bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;		TTI::ReductionFlags Flags) const;

int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,		int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
Show All 11 Lines

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//		//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"		#include "AArch64ExpandImm.h"
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code -#include "AArch64ExpandImm.h" Lint: Pre-merge checks: clang-format: please reformat the code ``` -#include "AArch64ExpandImm.h" ```
#include "AArch64TargetTransformInfo.h"		#include "AArch64TargetTransformInfo.h"
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code +#include "AArch64ExpandImm.h" Lint: Pre-merge checks: clang-format: please reformat the code ``` +#include "AArch64ExpandImm.h" ```
#include "MCTargetDesc/AArch64AddressingModes.h"		#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/LoopInfo.h"		#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"		#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"		#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"		#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"		#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"		#include "llvm/IR/IntrinsicsAArch64.h"
		#include "llvm/IR/Use.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include <algorithm>		#include <algorithm>
using namespace llvm;		using namespace llvm;

#define DEBUG_TYPE "aarch64tti"		#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",		static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
cl::init(true), cl::Hidden);		cl::init(true), cl::Hidden);
▲ Show 20 Lines • Show All 940 Lines • ▼ Show 20 Lines	if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
AllowPromotionWithoutCommonHeader = true;		AllowPromotionWithoutCommonHeader = true;
break;		break;
}		}
}		}
}		}
return Considerable;		return Considerable;
}		}

		bool AArch64TTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
		// This optimisation will generally introduce base address modifying
		// instruction(s) into the preheader and is only really useful for
		// unrolled loops, and we don't generally unroll when optimising for size.
		if (L->getHeader()->getParent()->hasOptSize() \|\|
		L->getNumBlocks() != 1)
		return false;

		// Find pointers with multiple uses within the loop.
		DenseMap<Value *, unsigned> NumPointerUses;
		for (auto &I : *L->getHeader()) {
		if (I.getType()->isPointerTy())
		NumPointerUses[&I] = 0;

		for (auto &Use : I.operands()) {
		if (!Use->getType()->isPointerTy())
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - if (L->getHeader()->getParent()->hasOptSize() \|\| - L->getNumBlocks() != 1) + if (L->getHeader()->getParent()->hasOptSize() \|\| L->getNumBlocks() != 1) Lint: Pre-merge checks: clang-format: please reformat the code ``` - if (L->getHeader()->getParent()->hasOptSize() \|\|…
		continue;
		if (NumPointerUses.count(Use))
		NumPointerUses[Use]++;
		else
		NumPointerUses[Use] = 0;
		}
		}

		return std::any_of(NumPointerUses.begin(), NumPointerUses.end(),
		[](detail::DenseMapPair<Value *, unsigned> Pair) {
		return Pair.second > 1;
		});
		}

bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,		bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {		TTI::ReductionFlags Flags) const {
auto *VTy = cast<VectorType>(Ty);		auto *VTy = cast<VectorType>(Ty);
unsigned ScalarBits = Ty->getScalarSizeInBits();		unsigned ScalarBits = Ty->getScalarSizeInBits();
switch (Opcode) {		switch (Opcode) {
case Instruction::FAdd:		case Instruction::FAdd:
case Instruction::FMul:		case Instruction::FMul:
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - [](detail::DenseMapPair<Value , unsigned> Pair) { - return Pair.second > 1; - }); + [](detail::DenseMapPair<Value , unsigned> Pair) { + return Pair.second > 1; + }); Lint: Pre-merge checks: clang-format: please reformat the code ``` - [](detail::DenseMapPair<Value…
case Instruction::And:		case Instruction::And:
case Instruction::Or:		case Instruction::Or:
case Instruction::Xor:		case Instruction::Xor:
case Instruction::Mul:		case Instruction::Mul:
return false;		return false;
case Instruction::Add:		case Instruction::Add:
return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;		return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;
case Instruction::ICmp:		case Instruction::ICmp:
▲ Show 20 Lines • Show All 94 Lines • Show Last 20 Lines