This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
include/llvm/Analysis/
-
llvm/
-
Analysis/
-
TargetTransformInfo.h
-
TargetTransformInfoImpl.h
-
lib/
-
Analysis/
-
TargetTransformInfo.cpp
-
Target/AArch64/
-
AArch64/
1
AArch64TargetTransformInfo.h
-
Transforms/Utils/
-
Utils/
3/3
SimplifyCFG.cpp
-
test/Transforms/SimplifyCFG/AArch64/
-
Transforms/
-
SimplifyCFG/
-
AArch64/
2
check-instr-cost-for-folding.ll

Differential D67281

[AArch64][SimplifyCFG] Add additional cost for instructions in mergeConditionalStoreToAddress
AbandonedPublic

Authored by kpdev42 on Sep 6 2019, 8:37 AM.

Download Raw Diff

Details

Reviewers

efriedma
lebedev.ri

Summary

There is a regression in AArch64 after commit https://reviews.llvm.org/rL317368.
Some blocks are considered worthwhile for merging conditional stores, but these merges lead to suboptimal code (bug https://bugs.llvm.org/show_bug.cgi?id=43205 PR#43205 )
So some additional cost needs to be added to instructions that cannot be represented by a single conditional instruction.
For example:
add nsw i32 %tmp 1 can be represented as cset
or will be split into csel and orr

Diff Detail

Event Timeline

kpdev42 created this revision.Sep 6 2019, 8:37 AM

Herald added subscribers: llvm-commits, hiraditya, kristof.beyls. · View Herald TranscriptSep 6 2019, 8:37 AM

Maybe the cause of this regression lies elsewhere and I have been working in the wrong direction. If so, I would really appreciate any clarification about it.

I'm trying to understand the issue you're seeing... I guess it comes down to something like the following?

Before:

        tst     x18, x1
        b.eq    .LBB0_10
.LBB0_9:
        orr     x16, x16, x18
        add     w0, w0, #1
        str     xzr, [x13, #56]
.LBB0_10:
        cbz     x11, .LBB0_7

After:

and     x2, x18, x1
orr     x3, x16, x18
tst     x18, x1
orr     x2, x2, x11
cinc    w0, w0, ne
csel    x16, x16, x3, eq
cbz     x2, .LBB0_7

I agree, five instructions is probably too many to speculate to eliminate a store. But the patch doesn't really reflect the actual costs here, which largely have to do with the PHI->select transform rather than the actual arithmetic instructions.

Here's the relevant IR after the store merging transform:

if.then:                                          ; preds = %for.body12
  %or = or i64 %res_in7.046, %bit.044
  %inc = add nsw i32 %retval1.243, 1
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body12
  %retval1.3 = phi i32 [ %inc, %if.then ], [ %retval1.243, %for.body12 ]
  %res_in7.1 = phi i64 [ %or, %if.then ], [ %res_in7.046, %for.body12 ]
  br i1 %tobool14, label %for.inc, label %if.then15

if.then15:                                        ; preds = %if.end
  br label %for.inc

for.inc:                                          ; preds = %if.then15, %if.end
  %simplifycfg.merge = phi i8* [ null, %if.then15 ], [ null, %if.end ]
  %4 = xor i1 %tobool, true
  %5 = xor i1 %tobool14, true
  %6 = or i1 %4, %5
  br i1 %6, label %7, label %8

7:                                                ; preds = %for.inc
  store i8* %simplifycfg.merge, i8** %proc, align 8
  br label %8

8:                                                ; preds = %for.inc, %7
  %inc18 = add nuw nsw i32 %j.047, 1
  %shl = shl i64 %bit.044, 1
  %exitcond = icmp eq i32 %inc18, 64
  br i1 %exitcond, label %for.cond.cleanup11, label %for.body12

Some of the branches collapse; at this point, we've basically traded a store for a branch which is likely predictable, assuming we turn the "or i1" back into a branch. That's maybe okay, I guess? But then we decide "if.then" is small enough to "predicate" it at the IR level, and we never reverse that decision when it turns out that doesn't simplify anything.

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
179	This makes no sense; integer logic should be cheap.
llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll
2	This testcase is way too complicated given the change you're making.

Thank you for detailed answer.

But then we decide "if.then" is small enough to "predicate" it at the IR level, and we never reverse that decision when it turns out that doesn't simplify anything.

Yes, I totally agree with you.

I agree, five instructions is probably too many to speculate to eliminate a store. But the patch doesn't really reflect the actual costs here, which largely have to do with the PHI->select transform rather than the actual arithmetic instructions.

Two phi nodes are replaced with select instructions in static bool FoldTwoEntryPHINode (which is invoked after mergeConditionalStoreToAddress)

while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
  // Change the PHI node into a select instruction.
  Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
  Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);

  Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", InsertPt);
  PN->replaceAllUsesWith(Sel);
  Sel->takeName(PN);
  PN->eraseFromParent();
}

So is some additional analysis needed here? I mean, should FoldTwoEntryPHINode check whether this folding is actually worthwhile?

This needs a testcase (check-instr-cost-for-folding.ll isn't it.)
Can you see if D67315 already solves whatever issue you are seeing?

I checked D67315 , unfortunately it does not help, generated code is the same.

Simplify test case. Relevant C code is:

typedef struct {
  int * dummy;
  void *proc;
} ptr_wrapper;

ptr_wrapper * fds;

/*
 * Reduced reproducer for the conditional-store merging discussed in this
 * review.
 *
 * Shifts `bit` left by one position per iteration until it becomes zero.
 * On each iteration there are two independent conditional stores of NULL
 * to the same address (fds->proc):
 *   - the first is guarded by `in & bit` and shares its block with two
 *     cheap arithmetic updates (res_in, retval);
 *   - the second is guarded by `mask & 0x1`, which does not change inside
 *     the loop.
 *
 * in   - bit set tested against the moving `bit`
 * bit  - starting bit pattern; the loop ends once it has been shifted to 0
 * mask - only its lowest bit is inspected
 *
 * Returns retval + res_in, converted to int.
 */
int do_select(const unsigned long in, 
              unsigned long bit,
              const unsigned long mask)
{
  int retval = 0;
  unsigned long res_in = 0;

  for( ; bit != 0; bit <<= 1)
  {
  	/* First conditional block: two arithmetic updates plus a store. */
  	if (in & bit) {
  		res_in |= bit;
  		retval++;
  		fds->proc = NULL;
  	}
  	
  	/* Second conditional block: a store to the same address as above. */
  	if (mask & 0x1) {
  		fds->proc = NULL;
  	}
  }

  return retval + res_in;
}

Rework patch. Allow merge store but bring additional check to FoldTwoEntryPHINode: before hoisting all instructions from IfBlock0 and IfBlock1 to DomBlock we check number of instructions in this DomBlock.

Please upload patches with full context (-U99999)
Patches should be as compared to the trunk, not previous patch.

lebedev.ri requested changes to this revision.Sep 11 2019, 3:57 AM

This revision now requires changes to proceed.Sep 11 2019, 3:57 AM

Yes, sure, will upload it in 24 hours, thank you!

I need to change my patch entirely, it needs discussion in mailing list and it will take time, so at the moment I will close this revision and after some time create another one.
Thanks to all.

I really wonder if the SimplifyCFG behavior is correct and you want to instead be tuning the undo transform in backend..

Yes, I think it is possible, that this code should be changed later in backend. It needs more investigation )

Herald added a subscriber: dmgreen. · View Herald TranscriptSep 16 2019, 6:53 AM

kpdev42 reclaimed this revision.Nov 20 2019, 4:51 AM

This revision now requires changes to proceed.Nov 20 2019, 4:51 AM

I changed my patch, now before merging blocks it calculates existing instructions in DomBB,
just to be sure that this block is not too big already
Maybe instead of using phi-node-folding threshold it is needed to use some other separate threshold.

And also it is still not clear, if we need to optimize this in a middle end or in a backend.
Maybe backend will be more proper place for it. @dmgreen please feel free to bring a light to this problem

P.S.: In this change I also simplified a testcase

To recap what exactly I am trying to eliminate:
clang merges basic blocks that are too big, and this leads to a performance regression

Before merging:

        tst     x18, x1
        b.eq    .LBB0_10
.LBB0_9:
        orr     x16, x16, x18
        add     w0, w0, #1
        str     xzr, [x13, #56]
.LBB0_10:
        cbz     x11, .LBB0_7

After merging

and     x2, x18, x1
orr     x3, x16, x18
tst     x18, x1
orr     x2, x2, x11
cinc    w0, w0, ne
csel    x16, x16, x3, eq
cbz     x2, .LBB0_7

Similar picture in test case:
Before merging:

for.body.lr.ph:                                   ; preds = %entry
  %0 = load %struct.ptr_wrapper*, %struct.ptr_wrapper** @g_wrapper, align 8
  %proc = getelementptr inbounds %struct.ptr_wrapper, %struct.ptr_wrapper* %0, i64 0, i32 1
  %and2 = and i64 %mask, 1
  %tobool3 = icmp eq i64 %and2, 0
  %and = and i64 %bit, %in
  %tobool = icmp eq i64 %and, 0
  br i1 %tobool, label %if.end, label %if.then

if.then:                                          ; preds = %for.body.lr.ph
  %or = or i64 0, %bit
  %inc = add nsw i32 0, 1
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body.lr.ph
  %retval1.1 = phi i32 [ %inc, %if.then ], [ 0, %for.body.lr.ph ]
  %res_in.1 = phi i64 [ %or, %if.then ], [ 0, %for.body.lr.ph ]
  %1 = xor i1 %tobool, true
  %2 = xor i1 %tobool3, true
  %3 = or i1 %1, %2
  store i8* null, i8** %proc, align 8
  %shl = shl i64 %bit, 1
  %cmp = icmp eq i64 %shl, 0
  br label %for.end

After merging:

for.body.lr.ph:                                   ; preds = %entry
  %0 = load %struct.ptr_wrapper*, %struct.ptr_wrapper** @g_wrapper, align 8
  %proc = getelementptr inbounds %struct.ptr_wrapper, %struct.ptr_wrapper* %0, i64 0, i32 1
  %and2 = and i64 %mask, 1
  %tobool3 = icmp eq i64 %and2, 0
  %and = and i64 %bit, %in
  %tobool = icmp eq i64 %and, 0
  %or = or i64 0, %bit
  %inc = add nsw i32 0, 1
  %retval1.1 = select i1 %tobool, i32 0, i32 %inc
  %res_in.1 = select i1 %tobool, i64 0, i64 %or
  %1 = xor i1 %tobool, true
  %2 = xor i1 %tobool3, true
  %3 = or i1 %1, %2
  store i8* null, i8** %proc, align 8
  %shl = shl i64 %bit, 1
  %cmp = icmp eq i64 %shl, 0
  br label %for.end

lebedev.ri added inline comments.Nov 20 2019, 5:33 AM

llvm/lib/Transforms/Utils/SimplifyCFG.cpp
2428	Not clang-formatted
llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll
26–30	I think this testcase is overreduced, which makes it impossible to actually guess what the fix should be. https://godbolt.org/z/sxmNMd

Fix formatting in SimplifyCFG.cpp and remaster testcase (I reduced it more accurately to avoid overreducing)
Please @lebedev.ri take a look

On X86 branch misprediction is ~10..~20 cycles,
Is it dirt cheap on AArch64? (any number?)
Perhaps we need to redefine the threshold in terms of branch misprediction cost?

llvm/lib/Transforms/Utils/SimplifyCFG.cpp
2420–2423	So in other words we'd perform the fold only if the preceding block is not larger than what we'd add via folding. In other words, if we are okay flattening a 4-instruction 2-entry PHI, the dominating BB must contain fewer than 4 instructions. That seems awfully hand-wavy to me, I'm afraid :( It will make the fold not happen in all the cases I'm aware of.

In D67281#1756463, @lebedev.ri wrote:

On X86 branch misprediction is ~10..~20 cycles,

Very interesting, I didn't measure cycles of misprediction. Only time of execution.

Is it dirt cheap on AArch64? (any number?)

Excuse me, I didn't get the question :) You asked about number of branch misprediction cycles?

Perhaps we need to redefine the threshold in terms of branch misprediction cost?

It actually sounds very promising. But at the moment I do not know where to get this cost :) Is it supposed to be in the processor description (e.g. for Cortex-A75 - https://developer.arm.com/docs/100403/0301 )?

And yet another thought: maybe we will just compare execution latency / throughput for merged and non-merged variants and will choose the variant with the smallest total value?

For example (all data about latency/throughput is taken from https://developer.arm.com/docs/101398/0200/arm-cortex-a75-software-optimization-guide-v20 / code taken from https://bugs.llvm.org/show_bug.cgi?id=43205 ):

Non-merged variant
                            | Execution Latency | Execution Throughput
---------------------------------------------------------------------
tst     x18, x1             | 1                 | 2
b.eq    .LBB0_10            | 1                 | 1
.LBB0_9:                    |                   | 
orr     x16, x16, x18       | 1                 | 2
add     w0, w0, #1          | 1                 | 2
str     xzr, [x13, #56]     | 1                 | 1
.LBB0_10:                   |                   | 
cbz     x11, .LBB0_7        | 1                 | 1
str	xzr, [x13,#56]          | 1                 | 1
---------------------------------------------------------------------
Total:                      | 7                 | 10

Merged variant
                            | Execution Latency | Execution Throughput
---------------------------------------------------------------------
and	x3, x2, x1              | 1                 | 2
tst	x2, x1                  | 1                 | 2
orr	x5, x11, x3             | 1                 | 2
cinc	w0, w0, ne          | 1                 | 2
csel	x3, xzr, x2, eq     | 1                 | 2
cbz	x5, .LBB0_7             | 1                 | 1
str	xzr, [x13,#56]          | 1                 | 1
orr	x16, x16, x3            | 1                 | 2
---------------------------------------------------------------------
Total:                      | 8                 | 14

In case above non-merged variant is better.

Is it valid approach?

llvm/lib/Transforms/Utils/SimplifyCFG.cpp
2420–2423	Yes, agree, using this comparison is a hand-wave as it is :) if (Cost > PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic) { So we need a separate threshold here. I will change it today

I have added check for additional instructions which will be generated after select lowering (In lambda IsWorthwile in mergeConditionalStoreToAddress)
Update test.

I think it could be useful, if I add my small runtime benchmark here:

small_benchmark.c (3 KB) Download

According to this benchmark, the patched version is about 2 times faster than the original.

Compiled with flags: clang -target aarch64-linux-android -O2
Device with CortexA76

To = execution time of original version (average for 1000 iterations)
Tp = execution time of patched version (average for 1000 iterations)

                          To / Tp 
Match all branches.       2.11
Match no branches.        2.12
Match IN branch only.     2.12
Match OUT branch only.    2.11
Match EX branch only.     2.07
Match IN & OUT branches.  2.13
Match IN & EX branches.   2.15
Match OUT & EX branches.  2.08

I'm sorry.
I'm sure the problem this is trying to address is real, but i don't have any suggestions how it should be fixed.
I somewhat think that the current branch->select logic is still too weak, and this problem should be fixed in backend.

kpdev42 marked 2 inline comments as done.Oct 25 2021, 1:54 AM

kpdev42 abandoned this revision.Dec 23 2022, 5:34 AM

Herald added a project: Restricted Project. · View Herald TranscriptDec 23 2022, 5:34 AM

Revision Contents

Path

Size

llvm/

include/

llvm/

Analysis/

TargetTransformInfo.h

13 lines

TargetTransformInfoImpl.h

5 lines

lib/

Analysis/

TargetTransformInfo.cpp

4 lines

Target/

AArch64/

AArch64TargetTransformInfo.h

11 lines

Transforms/

Utils/

SimplifyCFG.cpp

28 lines

test/

Transforms/

SimplifyCFG/

AArch64/

check-instr-cost-for-folding.ll

104 lines

Diff 219108

llvm/include/llvm/Analysis/TargetTransformInfo.h

Context not available.
	/// incurs significant execution cost.	/// incurs significant execution cost.
	bool isLoweredToCall(const Function *F) const;	bool isLoweredToCall(const Function *F) const;

		///
		/// \brief isExpensiveForFolding check if instruction can be replaced
		/// with conditional instruction (csel/cset etc), if not - then instruction is
		/// considered as expensive
		/// \param I - instructions for checking
		/// \return true if it cannot be replaced with one conditional instruction
		bool isExpensiveForFolding(const Instruction& I) const;

	struct LSRCost {	struct LSRCost {
	/// TODO: Some of these could be merged. Also, a lexical ordering	/// TODO: Some of these could be merged. Also, a lexical ordering
	/// isn't always optimal.	/// isn't always optimal.
Context not available.
	Intrinsic::ID IID) const = 0;	Intrinsic::ID IID) const = 0;
	virtual bool rewriteIntrinsicWithAddressSpace(	virtual bool rewriteIntrinsicWithAddressSpace(
	IntrinsicInst II, Value OldV, Value *NewV) const = 0;	IntrinsicInst II, Value OldV, Value *NewV) const = 0;
		virtual bool isExpensiveForFolding(const Instruction& I) const = 0;
	virtual bool isLoweredToCall(const Function *F) = 0;	virtual bool isLoweredToCall(const Function *F) = 0;
	virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,	virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
	UnrollingPreferences &UP) = 0;	UnrollingPreferences &UP) = 0;
Context not available.
	return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);	return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
	}	}

		bool isExpensiveForFolding(const Instruction& I) const override {
		return Impl.isExpensiveForFolding(I);
		}

	bool isLoweredToCall(const Function *F) override {	bool isLoweredToCall(const Function *F) override {
	return Impl.isLoweredToCall(F);	return Impl.isLoweredToCall(F);
	}	}
Context not available.

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Context not available.
	return false;	return false;
	}	}

		bool isExpensiveForFolding(const Instruction& I) const {
		(void)I;
		return false;
		}

	bool isLoweredToCall(const Function *F) {	bool isLoweredToCall(const Function *F) {
	assert(F && "A concrete function must be provided to this routine.");	assert(F && "A concrete function must be provided to this routine.");

Context not available.

llvm/lib/Analysis/TargetTransformInfo.cpp

Context not available.
	return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);	return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
	}	}

		bool TargetTransformInfo::isExpensiveForFolding(const Instruction& I) const {
		return TTIImpl->isExpensiveForFolding(I);
		}

	void TargetTransformInfo::getUnrollingPreferences(	void TargetTransformInfo::getUnrollingPreferences(
	Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {	Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
	return TTIImpl->getUnrollingPreferences(L, SE, UP);	return TTIImpl->getUnrollingPreferences(L, SE, UP);
Context not available.

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Context not available.
	return 2;	return 2;
	}	}

		bool isExpensiveForFolding(const Instruction& I) const {
		switch (I.getOpcode()) {
		case Instruction::Or:
		case Instruction::And:
		case Instruction::Xor:
		efriedmaUnsubmitted Not Done Reply Inline Actions This makes no sense; integer logic should be cheap. efriedma: This makes no sense; integer logic should be cheap.
		return true;
		default:
		return false;
		}
		}

	bool useReductionIntrinsic(unsigned Opcode, Type *Ty,	bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
	TTI::ReductionFlags Flags) const;	TTI::ReductionFlags Flags) const;

Context not available.

llvm/lib/Transforms/Utils/SimplifyCFG.cpp

Context not available.
	cl::desc("Limit maximum recursion depth when calculating costs of "	cl::desc("Limit maximum recursion depth when calculating costs of "
	"speculatively executed instructions"));	"speculatively executed instructions"));

		static cl::opt<bool> CheckSpeculationCost(
		"check-speculation-cost", cl::Hidden, cl::init(true),
		cl::desc("When merging conditional stores, add additional cost to instructions "
		"which cannot be represented by one conditional instruction"));

	STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");	STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
	STATISTIC(NumLinearMaps,	STATISTIC(NumLinearMaps,
	"Number of switch instructions turned into linear mapping");	"Number of switch instructions turned into linear mapping");
		lebedev.riUnsubmitted Done Reply Inline Actions Not clang-formatted lebedev.ri: Not clang-formatted
		lebedev.riUnsubmitted Done Reply Inline Actions So in other words we'd only perform the fold only if the preceding block is not larger than what we'd add via folding. In other words if we are okay flatteing 4-instruction 2-entry PHI, the dominating BB must contain less than 4 instructions. That seems awfully hand-wavy to me, i'm afraid :( It will make the fold not happen in all the cases i'm aware of. lebedev.ri: So in other words we'd only perform the fold only if the preceding block is not larger than…
		kpdev42AuthorUnsubmitted Done Reply Inline Actions Yes, agree, using this comparison is a hand-wave as it is :) if (Cost > PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic) { So we need a separate threshold here. I will change it today kpdev42: Yes, agree, using this comparison is a hand-wave as it is :) ``` if (Cost >…
Context not available.
	BasicBlock QTB, BasicBlock QFB,	BasicBlock QTB, BasicBlock QFB,
	BasicBlock PostBB, Value Address,	BasicBlock PostBB, Value Address,
	bool InvertPCond, bool InvertQCond,	bool InvertPCond, bool InvertQCond,
	const DataLayout &DL) {	const DataLayout &DL,
		const TargetTransformInfo &TTI) {
	auto IsaBitcastOfPointerType = [](const Instruction &I) {	auto IsaBitcastOfPointerType = [](const Instruction &I) {
	return Operator::getOpcode(&I) == Instruction::BitCast &&	return Operator::getOpcode(&I) == Instruction::BitCast &&
	I.getType()->isPointerTy();	I.getType()->isPointerTy();
Context not available.
	for (auto &I : BB->instructionsWithoutDebug()) {	for (auto &I : BB->instructionsWithoutDebug()) {
	// Cheap instructions viable for folding.	// Cheap instructions viable for folding.
	if (isa<BinaryOperator>(I) \|\| isa<GetElementPtrInst>(I) \|\|	if (isa<BinaryOperator>(I) \|\| isa<GetElementPtrInst>(I) \|\|
	isa<StoreInst>(I))	isa<StoreInst>(I)) {
	++N;	++N;

		if (CheckSpeculationCost && TTI.isExpensiveForFolding(I)) {
		++N;
		}
		}

	// Free instructions.	// Free instructions.
	else if (I.isTerminator() \|\| IsaBitcastOfPointerType(I))	else if (I.isTerminator() \|\| IsaBitcastOfPointerType(I))
	continue;	continue;
Context not available.
	}	}

	static bool mergeConditionalStores(BranchInst PBI, BranchInst QBI,	static bool mergeConditionalStores(BranchInst PBI, BranchInst QBI,
	const DataLayout &DL) {	const DataLayout &DL, const TargetTransformInfo &TTI) {
	// The intention here is to find diamonds or triangles (see below) where each	// The intention here is to find diamonds or triangles (see below) where each
	// conditional block contains a store to the same address. Both of these	// conditional block contains a store to the same address. Both of these
	// stores are conditional, so they can't be unconditionally sunk. But it may	// stores are conditional, so they can't be unconditionally sunk. But it may
Context not available.
	bool Changed = false;	bool Changed = false;
	for (auto *Address : CommonAddresses)	for (auto *Address : CommonAddresses)
	Changed \|= mergeConditionalStoreToAddress(	Changed \|= mergeConditionalStoreToAddress(
	PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL);	PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL, TTI);
	return Changed;	return Changed;
	}	}

Context not available.
	/// that PBI and BI are both conditional branches, and BI is in one of the	/// that PBI and BI are both conditional branches, and BI is in one of the
	/// successor blocks of PBI - PBI branches to BI.	/// successor blocks of PBI - PBI branches to BI.
	static bool SimplifyCondBranchToCondBranch(BranchInst PBI, BranchInst BI,	static bool SimplifyCondBranchToCondBranch(BranchInst PBI, BranchInst BI,
	const DataLayout &DL) {	const DataLayout &DL, const TargetTransformInfo& TTI) {
	assert(PBI->isConditional() && BI->isConditional());	assert(PBI->isConditional() && BI->isConditional());
	BasicBlock *BB = BI->getParent();	BasicBlock *BB = BI->getParent();

Context not available.
	// If both branches are conditional and both contain stores to the same	// If both branches are conditional and both contain stores to the same
	// address, remove the stores from the conditionals and create a conditional	// address, remove the stores from the conditionals and create a conditional
	// merged store at the end.	// merged store at the end.
	if (MergeCondStores && mergeConditionalStores(PBI, BI, DL))	if (MergeCondStores && mergeConditionalStores(PBI, BI, DL, TTI))
	return true;	return true;

	// If this is a conditional branch in an empty block, and if any	// If this is a conditional branch in an empty block, and if any
Context not available.
	for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)	for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
	if (BranchInst PBI = dyn_cast<BranchInst>((PI)->getTerminator()))	if (BranchInst PBI = dyn_cast<BranchInst>((PI)->getTerminator()))
	if (PBI != BI && PBI->isConditional())	if (PBI != BI && PBI->isConditional())
	if (SimplifyCondBranchToCondBranch(PBI, BI, DL))	if (SimplifyCondBranchToCondBranch(PBI, BI, DL, TTI))
	return requestResimplify();	return requestResimplify();

	// Look for diamond patterns.	// Look for diamond patterns.
Context not available.
	if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB))	if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB))
	if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator()))	if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator()))
	if (PBI != BI && PBI->isConditional())	if (PBI != BI && PBI->isConditional())
	if (mergeConditionalStores(PBI, BI, DL))	if (mergeConditionalStores(PBI, BI, DL, TTI))
	return requestResimplify();	return requestResimplify();

	return false;	return false;
Context not available.

llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll

This file was added.

				; RUN: opt < %s -mtriple=aarch64-linux-gnu -simplifycfg -check-speculation-cost=true -S >%t
				; RUN: FileCheck %s < %t
				efriedmaUnsubmitted Not Done Reply Inline Actions This testcase is way too complicated given the change you're making. efriedma: This testcase is way too complicated given the change you're making.
				; ModuleID = 'do_select.c'

				%struct.fd_set_bits = type { i64, i64, i64, i64, i64, i64, i32, i8* }

				@g_max_i = common dso_local global i64 0, align 8
				@gv_fds = common dso_local global %struct.fd_set_bits* null, align 8

				; Function Attrs: nofree noinline norecurse nounwind
				define dso_local i32 @do_select(i32 %max_iters_count, i64 %in, i64 %out, i64 %ex, i64 %bit_init_val, i64 %mask) local_unnamed_addr #0 {
				entry:
				%cmp52 = icmp sgt i32 %max_iters_count, 0
				br i1 %cmp52, label %for.body.lr.ph, label %for.cond.cleanup

				for.body.lr.ph: ; preds = %entry
				%and13 = and i64 %mask, 780
				%tobool14 = icmp eq i64 %and13, 0
				br label %for.body

				for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup5
				%retval1.1.lcssa.lcssa = phi i32 [ %retval1.1.lcssa, %for.cond.cleanup5 ]
				br label %for.cond.cleanup

				for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
				%retval1.0.lcssa = phi i32 [ 0, %entry ], [ %retval1.1.lcssa.lcssa, %for.cond.cleanup.loopexit ]
				ret i32 %retval1.0.lcssa

				for.body: ; preds = %for.cond.cleanup5, %for.body.lr.ph
				%retval1.054 = phi i32 [ 0, %for.body.lr.ph ], [ %retval1.1.lcssa, %for.cond.cleanup5 ]
				lebedev.riUnsubmitted Not Done Reply Inline Actions I think this testcase is overreduced, which makes it impossible to actually guess what the fix should be. https://godbolt.org/z/sxmNMd lebedev.ri: I think this testcase is overreduced, which makes it impossible to actually guess what the fix…
				%k.053 = phi i32 [ 0, %for.body.lr.ph ], [ %inc23, %for.cond.cleanup5 ]
				%0 = load volatile %struct.fd_set_bits, %struct.fd_set_bits* @gv_fds, align 8
				%res_in = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits* %0, i64 0, i32 3
				%1 = load i64, i64* %res_in, align 8
				%2 = load volatile i64, i64* @g_max_i, align 8
				%cmp348 = icmp eq i64 %2, 0
				br i1 %cmp348, label %for.cond.cleanup5, label %for.cond8.preheader.lr.ph

				for.cond8.preheader.lr.ph: ; preds = %for.body
				%proc = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits* %0, i64 0, i32 7
				br label %for.cond8.preheader

				for.cond8.preheader: ; preds = %for.cond8.preheader.lr.ph, %for.cond.cleanup11
				%indvars.iv = phi i64 [ 0, %for.cond8.preheader.lr.ph ], [ %indvars.iv.next, %for.cond.cleanup11 ]
				%rinp.050 = phi i64* [ %1, %for.cond8.preheader.lr.ph ], [ %incdec.ptr, %for.cond.cleanup11 ]
				%retval1.149 = phi i32 [ %retval1.054, %for.cond8.preheader.lr.ph ], [ %retval1.3.lcssa, %for.cond.cleanup11 ]
				br label %for.body12

				for.cond.cleanup5.loopexit: ; preds = %for.cond.cleanup11
				%retval1.3.lcssa.lcssa = phi i32 [ %retval1.3.lcssa, %for.cond.cleanup11 ]
				br label %for.cond.cleanup5

				for.cond.cleanup5: ; preds = %for.cond.cleanup5.loopexit, %for.body
				%retval1.1.lcssa = phi i32 [ %retval1.054, %for.body ], [ %retval1.3.lcssa.lcssa, %for.cond.cleanup5.loopexit ]
				%inc23 = add nuw nsw i32 %k.053, 1
				%exitcond56 = icmp eq i32 %inc23, %max_iters_count
				br i1 %exitcond56, label %for.cond.cleanup.loopexit, label %for.body

				for.cond.cleanup11: ; preds = %for.inc
				%retval1.3.lcssa = phi i32 [ %retval1.3, %for.inc ]
				%res_in7.1.lcssa = phi i64 [ %res_in7.1, %for.inc ]
				store i64 %res_in7.1.lcssa, i64* %rinp.050, align 8
				%indvars.iv.next = add nuw i64 %indvars.iv, 1
				%incdec.ptr = getelementptr inbounds i64, i64* %rinp.050, i64 1
				%3 = load volatile i64, i64* @g_max_i, align 8
				%cmp3 = icmp ugt i64 %3, %indvars.iv.next
				br i1 %cmp3, label %for.cond8.preheader, label %for.cond.cleanup5.loopexit

				for.body12: ; preds = %for.inc, %for.cond8.preheader
				%j.047 = phi i32 [ 0, %for.cond8.preheader ], [ %inc18, %for.inc ]
				%res_in7.046 = phi i64 [ 0, %for.cond8.preheader ], [ %res_in7.1, %for.inc ]
				%bit.044 = phi i64 [ %bit_init_val, %for.cond8.preheader ], [ %shl, %for.inc ]
				%retval1.243 = phi i32 [ %retval1.149, %for.cond8.preheader ], [ %retval1.3, %for.inc ]
				%and = and i64 %bit.044, %in
				%tobool = icmp eq i64 %and, 0
				br i1 %tobool, label %if.end, label %if.then

				if.then: ; preds = %for.body12
				; CHECK-LABEL: if.then:
				; CHECK-NEXT: %or = or i64 %res_in7.046, %bit.044
				; CHECK-NEXT: %inc = add nsw i32 %retval1.243, 1
				; CHECK-NEXT: store i8* null, i8** %proc, align 8
				%or = or i64 %res_in7.046, %bit.044
				%inc = add nsw i32 %retval1.243, 1
				store i8* null, i8** %proc, align 8
				br label %if.end

				if.end: ; preds = %for.body12, %if.then
				%retval1.3 = phi i32 [ %inc, %if.then ], [ %retval1.243, %for.body12 ]
				%res_in7.1 = phi i64 [ %or, %if.then ], [ %res_in7.046, %for.body12 ]
				br i1 %tobool14, label %for.inc, label %if.then15

				if.then15: ; preds = %if.end
				; CHECK-LABEL: if.then15:
				; CHECK-NEXT: store i8* null, i8** %proc, align 8
				store i8* null, i8** %proc, align 8
				br label %for.inc

				for.inc: ; preds = %if.end, %if.then15
				%inc18 = add nuw nsw i32 %j.047, 1
				%shl = shl i64 %bit.044, 1
				%exitcond = icmp eq i32 %inc18, 64
				br i1 %exitcond, label %for.cond.cleanup11, label %for.body12
				}