This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
include/llvm/Analysis/
-
llvm/
-
Analysis/
-
TargetTransformInfo.h
-
TargetTransformInfoImpl.h
-
lib/
-
Analysis/
-
TargetTransformInfo.cpp
-
Target/AArch64/
-
AArch64/
1
AArch64TargetTransformInfo.h
-
Transforms/Utils/
-
Utils/
3/3
SimplifyCFG.cpp
-
test/Transforms/SimplifyCFG/AArch64/
-
Transforms/
-
SimplifyCFG/
-
AArch64/
-
check-convert-to-cond-instr.ll

Differential D67281

[AArch64][SimplifyCFG] Add additional cost for instructions in mergeConditionalStoreToAddress
AbandonedPublic

Authored by kpdev42 on Sep 6 2019, 8:37 AM.

Download Raw Diff

Details

Reviewers

efriedma
lebedev.ri

Summary

There is a regression in AArch64 after commit https://reviews.llvm.org/rL317368.
Some blocks are considered worthwhile for merging conditional stores, but these merges lead to suboptimal code (bug https://bugs.llvm.org/show_bug.cgi?id=43205, PR#43205).
So some additional cost needs to be added to instructions that cannot be represented by a single conditional instruction.
For example:
add nsw i32 %tmp 1 can be represented as cset
or will be split in csel and orr

Diff Detail

Event Timeline

kpdev42 created this revision.Sep 6 2019, 8:37 AM

Herald added subscribers: llvm-commits, hiraditya, kristof.beyls. · View Herald TranscriptSep 6 2019, 8:37 AM

Maybe the cause of this regression lies elsewhere and I have been working in the wrong direction. If so, I would really appreciate any clarification about it.

I'm trying to understand the issue you're seeing... I guess it comes down to something like the following?

Before:

        tst     x18, x1
        b.eq    .LBB0_10
.LBB0_9:
        orr     x16, x16, x18
        add     w0, w0, #1
        str     xzr, [x13, #56]
.LBB0_10:
        cbz     x11, .LBB0_7

After:

and     x2, x18, x1
orr     x3, x16, x18
tst     x18, x1
orr     x2, x2, x11
cinc    w0, w0, ne
csel    x16, x16, x3, eq
cbz     x2, .LBB0_7

I agree, five instructions is probably too many to speculate to eliminate a store. But the patch doesn't really reflect the actual costs here, which largely have to do with the PHI->select transform rather than the actual arithmetic instructions.

Here's the relevant IR after the store merging transform:

if.then:                                          ; preds = %for.body12
  %or = or i64 %res_in7.046, %bit.044
  %inc = add nsw i32 %retval1.243, 1
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body12
  %retval1.3 = phi i32 [ %inc, %if.then ], [ %retval1.243, %for.body12 ]
  %res_in7.1 = phi i64 [ %or, %if.then ], [ %res_in7.046, %for.body12 ]
  br i1 %tobool14, label %for.inc, label %if.then15

if.then15:                                        ; preds = %if.end
  br label %for.inc

for.inc:                                          ; preds = %if.then15, %if.end
  %simplifycfg.merge = phi i8* [ null, %if.then15 ], [ null, %if.end ]
  %4 = xor i1 %tobool, true
  %5 = xor i1 %tobool14, true
  %6 = or i1 %4, %5
  br i1 %6, label %7, label %8

7:                                                ; preds = %for.inc
  store i8* %simplifycfg.merge, i8** %proc, align 8
  br label %8

8:                                                ; preds = %for.inc, %7
  %inc18 = add nuw nsw i32 %j.047, 1
  %shl = shl i64 %bit.044, 1
  %exitcond = icmp eq i32 %inc18, 64
  br i1 %exitcond, label %for.cond.cleanup11, label %for.body12

Some of the branches collapse; at this point, we've basically traded a store for a branch which is likely predictable, assuming we turn the "or i1" back into a branch. That's maybe okay, I guess? But then we decide "if.then" is small enough to "predicate" it at the IR level, and we never reverse that decision when it turns out that doesn't simplify anything.

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
179	This makes no sense; integer logic should be cheap.
llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll
2 ↗	(On Diff #219108)	This testcase is way too complicated given the change you're making.

Thank you for detailed answer.

But then we decide "if.then" is small enough to "predicate" it at the IR level, and we never reverse that decision when it turns out that doesn't simplify anything.

Yes, I totally agree with you.

I agree, five instructions is probably too many to speculate to eliminate a store. But the patch doesn't really reflect the actual costs here, which largely have to do with the PHI->select transform rather than the actual arithmetic instructions.

Two phi nodes are replaced with select instructions in static bool FoldTwoEntryPHINode (which is invoked after mergeConditionalStoreToAddress)

while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
  // Change the PHI node into a select instruction.
  Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
  Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);

  Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", InsertPt);
  PN->replaceAllUsesWith(Sel);
  Sel->takeName(PN);
  PN->eraseFromParent();
}

So it is needed to add some additional analysis here? I mean - FoldTwoEntryPHINode should make some checks about expediency of this folding?

This needs a testcase (check-instr-cost-for-folding.ll isn't it.)
Can you see if D67315 already solves whatever issue you are seeing?

I checked D67315 , unfortunately it does not help, generated code is the same.

Simplify test case. Relevant C code is:

typedef struct {
  int * dummy;
  void *proc;
} ptr_wrapper;

ptr_wrapper * fds;

int do_select(const unsigned long in, 
              unsigned long bit,
              const unsigned long mask)
{
  int retval = 0;
  unsigned long res_in = 0;

  for( ; bit != 0; bit <<= 1)
  {
  	if (in & bit) {
  		res_in |= bit;
  		retval++;
  		fds->proc = NULL;
  	}
  	
  	if (mask & 0x1) {
  		fds->proc = NULL;
  	}
  }

  return retval + res_in;
}

Rework patch. Allow merge store but bring additional check to FoldTwoEntryPHINode: before hoisting all instructions from IfBlock0 and IfBlock1 to DomBlock we check number of instructions in this DomBlock.

Please upload patches with full context (-U99999)
Patches should be as compared to the trunk, not previous patch.

lebedev.ri requested changes to this revision.Sep 11 2019, 3:57 AM

This revision now requires changes to proceed.Sep 11 2019, 3:57 AM

Yes, sure, will upload it in 24 hours, thank you!

I need to change my patch entirely, it needs discussion in mailing list and it will take time, so at the moment I will close this revision and after some time create another one.
Thanks to all.

I really wonder if the SimplifyCFG behavior is correct and you want to instead be tuning the undo transform in backend..

Yes, I think it is possible, that this code should be changed later in backend. It needs more investigation )

Herald added a subscriber: dmgreen. · View Herald TranscriptSep 16 2019, 6:53 AM

kpdev42 reclaimed this revision.Nov 20 2019, 4:51 AM

This revision now requires changes to proceed.Nov 20 2019, 4:51 AM

I changed my patch, now before merging blocks it calculates existing instructions in DomBB,
just to be sure that this block is not too big already
Maybe instead of using phi-node-folding threshold it is needed to use some other separate threshold.

And also it is still not clear, if we need to optimize this in a middle end or in a backend.
Maybe the backend would be a more appropriate place for it. @dmgreen, please feel free to shed some light on this problem.

P.S.: In this change I also simplified a testcase

To recall what exactly I am trying to eliminate:
clang merges basic blocks that are too big, and this leads to a performance regression

Before merging:

        tst     x18, x1
        b.eq    .LBB0_10
.LBB0_9:
        orr     x16, x16, x18
        add     w0, w0, #1
        str     xzr, [x13, #56]
.LBB0_10:
        cbz     x11, .LBB0_7

After merging

and     x2, x18, x1
orr     x3, x16, x18
tst     x18, x1
orr     x2, x2, x11
cinc    w0, w0, ne
csel    x16, x16, x3, eq
cbz     x2, .LBB0_7

Similar picture in test case:
Before merging:

for.body.lr.ph:                                   ; preds = %entry
  %0 = load %struct.ptr_wrapper*, %struct.ptr_wrapper** @g_wrapper, align 8
  %proc = getelementptr inbounds %struct.ptr_wrapper, %struct.ptr_wrapper* %0, i64 0, i32 1
  %and2 = and i64 %mask, 1
  %tobool3 = icmp eq i64 %and2, 0
  %and = and i64 %bit, %in
  %tobool = icmp eq i64 %and, 0
  br i1 %tobool, label %if.end, label %if.then

if.then:                                          ; preds = %for.body.lr.ph
  %or = or i64 0, %bit
  %inc = add nsw i32 0, 1
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body.lr.ph
  %retval1.1 = phi i32 [ %inc, %if.then ], [ 0, %for.body.lr.ph ]
  %res_in.1 = phi i64 [ %or, %if.then ], [ 0, %for.body.lr.ph ]
  %1 = xor i1 %tobool, true
  %2 = xor i1 %tobool3, true
  %3 = or i1 %1, %2
  store i8* null, i8** %proc, align 8
  %shl = shl i64 %bit, 1
  %cmp = icmp eq i64 %shl, 0
  br label %for.end

After merging:

for.body.lr.ph:                                   ; preds = %entry
  %0 = load %struct.ptr_wrapper*, %struct.ptr_wrapper** @g_wrapper, align 8
  %proc = getelementptr inbounds %struct.ptr_wrapper, %struct.ptr_wrapper* %0, i64 0, i32 1
  %and2 = and i64 %mask, 1
  %tobool3 = icmp eq i64 %and2, 0
  %and = and i64 %bit, %in
  %tobool = icmp eq i64 %and, 0
  %or = or i64 0, %bit
  %inc = add nsw i32 0, 1
  %retval1.1 = select i1 %tobool, i32 0, i32 %inc
  %res_in.1 = select i1 %tobool, i64 0, i64 %or
  %1 = xor i1 %tobool, true
  %2 = xor i1 %tobool3, true
  %3 = or i1 %1, %2
  store i8* null, i8** %proc, align 8
  %shl = shl i64 %bit, 1
  %cmp = icmp eq i64 %shl, 0
  br label %for.end

lebedev.ri added inline comments.Nov 20 2019, 5:33 AM

llvm/lib/Transforms/Utils/SimplifyCFG.cpp
2429	Not clang-formatted
llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll
25–29 ↗	(On Diff #230235)	I think this testcase is overreduced, which makes it impossible to actually guess what the fix should be. https://godbolt.org/z/sxmNMd

Fix formatting in SimplifyCFG.cpp and remaster testcase (I reduced it more accurately to avoid overreducing)
Please @lebedev.ri take a look

On X86 branch misprediction is ~10..~20 cycles,
Is it dirt cheap on AArch64? (any number?)
Perhaps we need to redefine the threshold in terms of branch misprediction cost?

llvm/lib/Transforms/Utils/SimplifyCFG.cpp
2421–2424	So in other words we'd perform the fold only if the preceding block is not larger than what we'd add via folding. In other words if we are okay flattening a 4-instruction 2-entry PHI, the dominating BB must contain fewer than 4 instructions. That seems awfully hand-wavy to me, i'm afraid :( It will make the fold not happen in all the cases i'm aware of.

In D67281#1756463, @lebedev.ri wrote:

On X86 branch misprediction is ~10..~20 cycles,

Very interesting, I didn't measure cycles of misprediction. Only time of execution.

Is it dirt cheap on AArch64? (any number?)

Excuse me, I didn't get the question :) You asked about number of branch misprediction cycles?

Perhaps we need to redefine the threshold in terms of branch misprediction cost?

It actually sounds very promising. But at the moment I do not know where to get this cost :) Is it supposed to be in the processor description (e.g. for Cortex-A75 - https://developer.arm.com/docs/100403/0301 )?

And yet another thought: maybe we will just compare execution latency / throughput for merged and non-merged variants and will choose the variant with the smallest total value?

For example (all data about latency/throughput is taken from https://developer.arm.com/docs/101398/0200/arm-cortex-a75-software-optimization-guide-v20 / code taken from https://bugs.llvm.org/show_bug.cgi?id=43205 ):

Non-merged variant
                            | Execution Latency | Execution Throughput
---------------------------------------------------------------------
tst     x18, x1             | 1                 | 2
b.eq    .LBB0_10            | 1                 | 1
.LBB0_9:                    |                   | 
orr     x16, x16, x18       | 1                 | 2
add     w0, w0, #1          | 1                 | 2
str     xzr, [x13, #56]     | 1                 | 1
.LBB0_10:                   |                   | 
cbz     x11, .LBB0_7        | 1                 | 1
str	xzr, [x13,#56]          | 1                 | 1
---------------------------------------------------------------------
Total:                      | 7                 | 10

Merged variant
                            | Execution Latency | Execution Throughput
---------------------------------------------------------------------
and	x3, x2, x1              | 1                 | 2
tst	x2, x1                  | 1                 | 2
orr	x5, x11, x3             | 1                 | 2
cinc	w0, w0, ne          | 1                 | 2
csel	x3, xzr, x2, eq     | 1                 | 2
cbz	x5, .LBB0_7             | 1                 | 1
str	xzr, [x13,#56]          | 1                 | 1
orr	x16, x16, x3            | 1                 | 2
---------------------------------------------------------------------
Total:                      | 8                 | 14

In case above non-merged variant is better.

Is it valid approach?

llvm/lib/Transforms/Utils/SimplifyCFG.cpp
2421–2424	Yes, agree, using this comparison is a hand-wave as it is :) if (Cost > PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic) { So we need a separate threshold here. I will change it today

I have added check for additional instructions which will be generated after select lowering (In lambda IsWorthwile in mergeConditionalStoreToAddress)
Update test.

I think it could be useful, if I add my small runtime benchmark here:

small_benchmark.c3 KBDownload

According to this benchmark, the patched version is about 2 times faster than the original.

Compiled with flags: clang -target aarch64-linux-android -O2
Device with CortexA76

To = execution time of original version (average for 1000 iterations)
Tp = execution time of patched version (average for 1000 iterations)

                          To / Tp 
Match all branches.       2.11
Match no branches.        2.12
Match IN branch only.     2.12
Match OUT branch only.    2.11
Match EX branch only.     2.07
Match IN & OUT branches.  2.13
Match IN & EX branches.   2.15
Match OUT & EX branches.  2.08

I'm sorry.
I'm sure the problem this is trying to address is real, but i don't have any suggestions how it should be fixed.
I somewhat think that the current branch->select logic is still too weak, and this problem should be fixed in backend.

kpdev42 marked 2 inline comments as done.Oct 25 2021, 1:54 AM

kpdev42 abandoned this revision.Dec 23 2022, 5:34 AM

Herald added a project: Restricted Project. · View Herald TranscriptDec 23 2022, 5:34 AM

Revision Contents

Path

Size

llvm/

include/

llvm/

Analysis/

TargetTransformInfo.h

32 lines

TargetTransformInfoImpl.h

5 lines

lib/

Analysis/

TargetTransformInfo.cpp

4 lines

Target/

AArch64/

AArch64TargetTransformInfo.h

9 lines

Transforms/

Utils/

SimplifyCFG.cpp

9 lines

test/

Transforms/

SimplifyCFG/

AArch64/

check-convert-to-cond-instr.ll

59 lines

Diff 234845

llvm/include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 381 Lines • ▼ Show 20 Lines	public:

/// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p		/// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
/// NewV, which has a different address space. This should happen for every		/// NewV, which has a different address space. This should happen for every
/// operand index that collectFlatAddressOperands returned for the intrinsic.		/// operand index that collectFlatAddressOperands returned for the intrinsic.
/// \returns true if the intrinsic /// was handled.		/// \returns true if the intrinsic /// was handled.
bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,		bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value OldV, Value NewV) const;		Value OldV, Value NewV) const;

		/// \brief isExpensiveToConvertToCondInstr checks whether the given instruction can be converted
		/// to a conditional instruction (e.g. csel/cset for AArch64) without generating additional instructions;
		/// if not, the instruction is considered expensive to convert.
		/// In some cases (e.g. BB merge) such conversions may lead to a performance downgrade.
		/// Example of a cheap conversion (for AArch64):
		/// \code
		/// add w0, w0, #1
		/// \endcode
		/// can be converted to one instruction
		/// \code
		/// cinc w0, w0, <condition_flag>
		/// \endcode
		/// Example of an expensive conversion (for AArch64):
		/// \code
		/// orr x16, x16, x18
		/// \endcode
		/// cannot be represented by one conditional instruction,
		/// therefore an additional instruction (`orr`) will be generated:
		/// \code
		/// orr x3, x16, x18
		/// csel x16, x16, x3, <condition_flag>
		/// \endcode
		/// \param I - the instruction to check
		/// \return true if this conversion will lead to the generation of additional instructions
		bool isExpensiveToConvertToCondInstr(const Instruction& I) const;


/// Test whether calls to a function lower to actual program function		/// Test whether calls to a function lower to actual program function
/// calls.		/// calls.
///		///
/// The idea is to test whether the program is likely to require a 'call'		/// The idea is to test whether the program is likely to require a 'call'
/// instruction or equivalent in order to call the given function.		/// instruction or equivalent in order to call the given function.
///		///
/// FIXME: It's not clear that this is a good or useful query API. Client's		/// FIXME: It's not clear that this is a good or useful query API. Client's
/// should probably move to simpler cost metrics using the above.		/// should probably move to simpler cost metrics using the above.
▲ Show 20 Lines • Show All 800 Lines • ▼ Show 20 Lines	public:
virtual int		virtual int
getUserCost(const User U, ArrayRef<const Value > Operands) = 0;		getUserCost(const User U, ArrayRef<const Value > Operands) = 0;
virtual bool hasBranchDivergence() = 0;		virtual bool hasBranchDivergence() = 0;
virtual bool isSourceOfDivergence(const Value *V) = 0;		virtual bool isSourceOfDivergence(const Value *V) = 0;
virtual bool isAlwaysUniform(const Value *V) = 0;		virtual bool isAlwaysUniform(const Value *V) = 0;
virtual unsigned getFlatAddressSpace() = 0;		virtual unsigned getFlatAddressSpace() = 0;
virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,		virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const = 0;		Intrinsic::ID IID) const = 0;
		virtual bool isExpensiveToConvertToCondInstr(const Instruction& I) const = 0;
virtual bool rewriteIntrinsicWithAddressSpace(		virtual bool rewriteIntrinsicWithAddressSpace(
IntrinsicInst II, Value OldV, Value *NewV) const = 0;		IntrinsicInst II, Value OldV, Value *NewV) const = 0;
virtual bool isLoweredToCall(const Function *F) = 0;		virtual bool isLoweredToCall(const Function *F) = 0;
virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,		virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
UnrollingPreferences &UP) = 0;		UnrollingPreferences &UP) = 0;
virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,		virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,		AssumptionCache &AC,
TargetLibraryInfo *LibInfo,		TargetLibraryInfo *LibInfo,
▲ Show 20 Lines • Show All 255 Lines • ▼ Show 20 Lines	bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
return Impl.collectFlatAddressOperands(OpIndexes, IID);		return Impl.collectFlatAddressOperands(OpIndexes, IID);
}		}

bool rewriteIntrinsicWithAddressSpace(		bool rewriteIntrinsicWithAddressSpace(
IntrinsicInst II, Value OldV, Value *NewV) const override {		IntrinsicInst II, Value OldV, Value *NewV) const override {
return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);		return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
}		}

		bool isExpensiveToConvertToCondInstr(const Instruction& I) const override {
		return Impl.isExpensiveToConvertToCondInstr(I);
		}

bool isLoweredToCall(const Function *F) override {		bool isLoweredToCall(const Function *F) override {
return Impl.isLoweredToCall(F);		return Impl.isLoweredToCall(F);
}		}
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,		void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UnrollingPreferences &UP) override {		UnrollingPreferences &UP) override {
return Impl.getUnrollingPreferences(L, SE, UP);		return Impl.getUnrollingPreferences(L, SE, UP);
}		}
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,		bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
▲ Show 20 Lines • Show All 501 Lines • Show Last 20 Lines

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 164 Lines • ▼ Show 20 Lines	bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
return false;		return false;
}		}

bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,		bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value OldV, Value NewV) const {		Value OldV, Value NewV) const {
return false;		return false;
}		}

		bool isExpensiveToConvertToCondInstr(const Instruction& I) const {
		static_cast<void>(I);
		return false;
		}

bool isLoweredToCall(const Function *F) {		bool isLoweredToCall(const Function *F) {
assert(F && "A concrete function must be provided to this routine.");		assert(F && "A concrete function must be provided to this routine.");

// FIXME: These should almost certainly not be handled here, and instead		// FIXME: These should almost certainly not be handled here, and instead
// handled with the help of TLI or the target itself. This was largely		// handled with the help of TLI or the target itself. This was largely
// ported from existing analysis heuristics here so that such refactorings		// ported from existing analysis heuristics here so that such refactorings
// can take place in the future.		// can take place in the future.

▲ Show 20 Lines • Show All 753 Lines • Show Last 20 Lines

llvm/lib/Analysis/TargetTransformInfo.cpp

Show First 20 Lines • Show All 228 Lines • ▼ Show 20 Lines	bool TargetTransformInfo::collectFlatAddressOperands(
return TTIImpl->collectFlatAddressOperands(OpIndexes, IID);		return TTIImpl->collectFlatAddressOperands(OpIndexes, IID);
}		}

bool TargetTransformInfo::rewriteIntrinsicWithAddressSpace(		bool TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
IntrinsicInst II, Value OldV, Value *NewV) const {		IntrinsicInst II, Value OldV, Value *NewV) const {
return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);		return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
}		}

		bool TargetTransformInfo::isExpensiveToConvertToCondInstr(const Instruction& I) const {
		return TTIImpl->isExpensiveToConvertToCondInstr(I);
		}

bool TargetTransformInfo::isLoweredToCall(const Function *F) const {		bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
return TTIImpl->isLoweredToCall(F);		return TTIImpl->isLoweredToCall(F);
}		}

bool TargetTransformInfo::isHardwareLoopProfitable(		bool TargetTransformInfo::isHardwareLoopProfitable(
Loop *L, ScalarEvolution &SE, AssumptionCache &AC,		Loop *L, ScalarEvolution &SE, AssumptionCache &AC,
TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {		TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {
return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);		return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
▲ Show 20 Lines • Show All 1,150 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Show First 20 Lines • Show All 170 Lines • ▼ Show 20 Lines	public:
bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {		bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
return isLegalMaskedLoadStore(DataType, Alignment);		return isLegalMaskedLoadStore(DataType, Alignment);
}		}

int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,		int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,		ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace,		unsigned AddressSpace,
bool UseMaskForCond = false,		bool UseMaskForCond = false,
bool UseMaskForGaps = false);		bool UseMaskForGaps = false);
		efriedmaUnsubmitted Not Done Reply Inline Actions This makes no sense; integer logic should be cheap. efriedma: This makes no sense; integer logic should be cheap.

bool		bool
shouldConsiderAddressTypePromotion(const Instruction &I,		shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader);		bool &AllowPromotionWithoutCommonHeader);

bool shouldExpandReduction(const IntrinsicInst *II) const {		bool shouldExpandReduction(const IntrinsicInst *II) const {
return false;		return false;
}		}

unsigned getGISelRematGlobalCost() const {		unsigned getGISelRematGlobalCost() const {
return 2;		return 2;
}		}

		bool isExpensiveToConvertToCondInstr(const Instruction& I) const {
		switch (I.getOpcode()) {
		case Instruction::Or:
		return true;
		default:
		return false;
		}
		}

bool useReductionIntrinsic(unsigned Opcode, Type *Ty,		bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;		TTI::ReductionFlags Flags) const;

int getArithmeticReductionCost(unsigned Opcode, Type *Ty,		int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm);		bool IsPairwiseForm);

int getShuffleCost(TTI::ShuffleKind Kind, Type Tp, int Index, Type SubTp);		int getShuffleCost(TTI::ShuffleKind Kind, Type Tp, int Index, Type SubTp);
/// @}		/// @}
};		};

} // end namespace llvm		} // end namespace llvm

#endif // LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H		#endif // LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H

llvm/lib/Transforms/Utils/SimplifyCFG.cpp

Show First 20 Lines • Show All 128 Lines • ▼ Show 20 Lines	static cl::opt<bool> SpeculateOneExpensiveInst(
cl::desc("Allow exactly one expensive instruction to be speculatively "		cl::desc("Allow exactly one expensive instruction to be speculatively "
"executed"));		"executed"));

static cl::opt<unsigned> MaxSpeculationDepth(		static cl::opt<unsigned> MaxSpeculationDepth(
"max-speculation-depth", cl::Hidden, cl::init(10),		"max-speculation-depth", cl::Hidden, cl::init(10),
cl::desc("Limit maximum recursion depth when calculating costs of "		cl::desc("Limit maximum recursion depth when calculating costs of "
"speculatively executed instructions"));		"speculatively executed instructions"));

		static cl::opt<bool> CheckConvertionToCondInstrCost(
		"check-convert-to-cond-instr", cl::Hidden, cl::init(true),
		cl::desc("When merging conditional stores, add additional cost to instructions "
		"which cannot be represented by one conditional instruction"));

STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");		STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
STATISTIC(NumLinearMaps,		STATISTIC(NumLinearMaps,
"Number of switch instructions turned into linear mapping");		"Number of switch instructions turned into linear mapping");
STATISTIC(NumLookupTables,		STATISTIC(NumLookupTables,
"Number of switch instructions turned into lookup tables");		"Number of switch instructions turned into lookup tables");
STATISTIC(		STATISTIC(
NumLookupTablesHoles,		NumLookupTablesHoles,
"Number of switch instructions turned into lookup tables (holes checked)");		"Number of switch instructions turned into lookup tables (holes checked)");
▲ Show 20 Lines • Show All 2,263 Lines • ▼ Show 20 Lines	for (BasicBlock::iterator I = IfBlock2->begin(); !I->isTerminator(); ++I)
}		}
}		}
assert(DomBlock && "Failed to find root DomBlock");		assert(DomBlock && "Failed to find root DomBlock");

LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond		LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond
<< " T: " << IfTrue->getName()		<< " T: " << IfTrue->getName()
<< " F: " << IfFalse->getName() << "\n");		<< " F: " << IfFalse->getName() << "\n");

// If we can still promote the PHI nodes after this gauntlet of tests,		// If we can still promote the PHI nodes after this gauntlet of tests,
// do all of the PHI's now.		// do all of the PHI's now.
Instruction *InsertPt = DomBlock->getTerminator();		Instruction *InsertPt = DomBlock->getTerminator();
IRBuilder<NoFolder> Builder(InsertPt);		IRBuilder<NoFolder> Builder(InsertPt);
		lebedev.riUnsubmitted Done Reply Inline Actions So in other words we'd only perform the fold only if the preceding block is not larger than what we'd add via folding. In other words if we are okay flatteing 4-instruction 2-entry PHI, the dominating BB must contain less than 4 instructions. That seems awfully hand-wavy to me, i'm afraid :( It will make the fold not happen in all the cases i'm aware of. lebedev.ri: So in other words we'd only perform the fold only if the preceding block is not larger than…
		kpdev42AuthorUnsubmitted Done Reply Inline Actions Yes, agree, using this comparison is a hand-wave as it is :) if (Cost > PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic) { So we need a separate threshold here. I will change it today kpdev42: Yes, agree, using this comparison is a hand-wave as it is :) ``` if (Cost >…

// Move all 'aggressive' instructions, which are defined in the		// Move all 'aggressive' instructions, which are defined in the
// conditional parts of the if's up to the dominating block.		// conditional parts of the if's up to the dominating block.
if (IfBlock1)		if (IfBlock1)
hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock1);		hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock1);
		lebedev.riUnsubmitted Done Reply Inline Actions Not clang-formatted lebedev.ri: Not clang-formatted
if (IfBlock2)		if (IfBlock2)
hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2);		hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2);

// Propagate fast-math-flags from phi nodes to replacement selects.		// Propagate fast-math-flags from phi nodes to replacement selects.
IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);		IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {		while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
if (isa<FPMathOperator>(PN))		if (isa<FPMathOperator>(PN))
Builder.setFastMathFlags(PN->getFastMathFlags());		Builder.setFastMathFlags(PN->getFastMathFlags());
▲ Show 20 Lines • Show All 587 Lines • ▼ Show 20 Lines	for (auto &I : BB->instructionsWithoutDebug()) {
if (llvm::find(FreeStores, S))		if (llvm::find(FreeStores, S))
continue;		continue;
// Else, we have a white-list of instructions that we are ak speculating.		// Else, we have a white-list of instructions that we are ak speculating.
if (!isa<BinaryOperator>(I) && !isa<GetElementPtrInst>(I))		if (!isa<BinaryOperator>(I) && !isa<GetElementPtrInst>(I))
return false; // Not in white-list - not worthwhile folding.		return false; // Not in white-list - not worthwhile folding.
// And finally, if this is a non-free instruction that we are okay		// And finally, if this is a non-free instruction that we are okay
// speculating, ensure that we consider the speculation budget.		// speculating, ensure that we consider the speculation budget.
BudgetRemaining -= TTI.getUserCost(&I);		BudgetRemaining -= TTI.getUserCost(&I);
		// Check if this instruction can be converted to conditional form
		// without generating additional instructions
		if (CheckConvertionToCondInstrCost && TTI.isExpensiveToConvertToCondInstr(I))
		BudgetRemaining--; // Additional instructions will be generated, reduce budget
if (BudgetRemaining < 0)		if (BudgetRemaining < 0)
return false; // Eagerly refuse to fold as soon as we're out of budget.		return false; // Eagerly refuse to fold as soon as we're out of budget.
}		}
assert(BudgetRemaining >= 0 &&		assert(BudgetRemaining >= 0 &&
"When we run out of budget we will eagerly return from within the "		"When we run out of budget we will eagerly return from within the "
"per-instruction loop.");		"per-instruction loop.");
return true;		return true;
};		};
▲ Show 20 Lines • Show All 3,160 Lines • Show Last 20 Lines

llvm/test/Transforms/SimplifyCFG/AArch64/check-convert-to-cond-instr.ll

This file was added.

				; RUN: llc %s -O2 -mtriple=aarch64-linux-gnu -check-convert-to-cond-instr=true -o %t
				; RUN: FileCheck --check-prefix=CHECKCOST %s < %t
				; RUN: llc %s -O2 -mtriple=aarch64-linux-gnu -check-convert-to-cond-instr=false -o %t
				; RUN: FileCheck --check-prefix=NOTCHECKCOST %s < %t

				; CHECKCOST-LABEL: .LBB0_2:
				; CHECKCOST: tst
				; CHECKCOST-NEXT: b.eq
				; NOTCHECKCOST-LABEL: .LBB0_2:
				; NOTCHECKCOST: and [[DSTREG:w[0-9]+]], [[SRCREG1:w[0-9]+]], [[SRCREG2:w[0-9]+]]
				; NOTCHECKCOST-NEXT: orr [[OR_DSTREG:w[0-9]+]], [[OR_SRCREG1:w[0-9]+]], [[SRCREG1]]
				; NOTCHECKCOST-NEXT: tst [[SRCREG1]], [[SRCREG2]]
				; NOTCHECKCOST-NEXT: orr [[DSTREG]], [[DSTREG]], [[OR2_SRCREG:w[0-9]+]]
				; NOTCHECKCOST-NEXT: cinc
				; NOTCHECKCOST-NEXT: csel

				%struct.anon = type { i32, i32 }

				@g_ptr = common dso_local local_unnamed_addr global %struct.anon* null, align 8

				define dso_local i32 @test_func(i32 %in, i32 %bit, i32 %mask) local_unnamed_addr {
				entry:
				%0 = load %struct.anon, %struct.anon* @g_ptr, align 8
				%result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
				%tobool2 = icmp eq i32 %mask, 0
				br label %do.body

				do.body: ; preds = %do.cond, %entry
				%bit.addr.0 = phi i32 [ %bit, %entry ], [ %shl, %do.cond ]
				%retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %do.cond ]
				%sum_bits.0 = phi i32 [ 0, %entry ], [ %sum_bits.1, %do.cond ]
				%and = and i32 %bit.addr.0, %in
				%tobool = icmp eq i32 %and, 0
				br i1 %tobool, label %if.end, label %if.then

				if.then: ; preds = %do.body
				%or = or i32 %sum_bits.0, %bit.addr.0
				%inc = add i32 %retval1.0, 1
				store i32* null, i32** %result, align 8
				br label %if.end

				if.end: ; preds = %do.body, %if.then
				%retval1.1 = phi i32 [ %inc, %if.then ], [ %retval1.0, %do.body ]
				%sum_bits.1 = phi i32 [ %or, %if.then ], [ %sum_bits.0, %do.body ]
				br i1 %tobool2, label %do.cond, label %if.then3

				if.then3: ; preds = %if.end
				store i32* null, i32** %result, align 8
				br label %do.cond

				do.cond: ; preds = %if.end, %if.then3
				%shl = shl i32 %bit.addr.0, 1
				%tobool6 = icmp eq i32 %shl, 0
				br i1 %tobool6, label %do.end, label %do.body

				do.end: ; preds = %do.cond
				%add = add i32 %sum_bits.1, %retval1.1
				ret i32 %add
				}
				No newline at end of file