The motivating test case is:

#include <set>

struct ICmp {
  bool operator()(int *a, int *b) const { return *a < *b; }
};
typedef std::set<int *, ICmp> PSet;

void foo(PSet *pset, int *l) {
  pset->insert(l);
}
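The perf profile shown later is annotated inside a benchmark named BM_CallF, whose source is not part of this report. A hypothetical harness using the open-source Google Benchmark library (the element count and value distribution are assumptions) could drive the hot loop like this:

#include <benchmark/benchmark.h>
#include <random>
#include <vector>

// Hypothetical reconstruction of BM_CallF: rebuild the set on every iteration
// so that _M_get_insert_unique_pos walks the red-black tree for each insert.
static void BM_CallF(benchmark::State &state) {
  std::mt19937 rng(42);
  std::vector<int> vals(4096);
  for (int &v : vals) v = static_cast<int>(rng());

  for (auto _ : state) {
    PSet pset;                        // PSet and foo are from the test case above
    for (int &v : vals)
      foo(&pset, &v);
    benchmark::DoNotOptimize(pset);   // keep the tree from being optimized away
  }
}
BENCHMARK(BM_CallF);
BENCHMARK_MAIN();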
The actual hot code is in the function _M_get_insert_unique_pos:

while (__x != 0)
  {
    __y = __x;
    __comp = _M_impl._M_key_compare(__k, _S_key(__x));
    __x = __comp ? _S_left(__x) : _S_right(__x);
  }
LLVM generates the following code:
.LBB1_2:
        movq    %rdx, %rbx
        movq    32(%rbx), %rcx
        movl    (%rcx), %ecx
        leaq    24(%rbx), %rdx
        leaq    16(%rbx), %rsi
        cmpl    %ecx, %eax
        cmovlq  %rsi, %rdx
        movq    (%rdx), %rdx
        testq   %rdx, %rdx
        jne     .LBB1_2
GCC generates:
 7.53 │4137d0:   mov    0x10(%rbx),%rax
 0.01 │4137d4:   mov    $0x1,%edi
      │4137d9:   test   %rax,%rax
 0.95 │4137dc: ↓ je     4137f7 <BM_CallF(testing::benchmark::State&)+0xd7>
 1.18 │4137de:   mov    %rax,%rbx
40.92 │4137e1:   mov    0x20(%rbx),%rax
34.82 │4137e5:   mov    (%rax),%ecx
      │4137e7:   cmp    %r14d,%ecx
 1.83 │4137ea: ↑ jg     4137d0 <BM_CallF(testing::benchmark::State&)+0xb0>
 8.05 │4137ec:   mov    0x18(%rbx),%rax
 0.01 │4137f0:   xor    %edi,%edi
      │4137f2:   test   %rax,%rax
 0.50 │4137f5: ↑ jne    4137de <BM_CallF(testing::benchmark::State&)+0xbe>
The GCC-generated code is 15% faster. The reason is that the LLVM-generated code contains a long dependence chain that includes the cmov and the instructions after it, so all of them must execute serially. In the GCC-generated code, the instructions after the branch have no data dependence on the instructions before the branch, so they can execute in parallel with them. Even when the branch is highly unpredictable (as when traversing an RBTree, like in this case), if the dependence chain is long enough, the correctly predicted iterations can still bring an overall performance improvement.
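As a rough, back-of-the-envelope illustration of that tradeoff (every number in this sketch is an assumption for illustration, not a measurement from this report):

#include <cstdio>

int main() {
  // Per tree step, the cmov version pays the full load->load->cmp->cmov chain
  // serially, while the branchy version overlaps the next step's chain with
  // the current one and only pays extra when the branch mispredicts.
  const double ChainCycles = 12.0;        // assumed latency of the serial chain
  const double OverlappedCycles = 4.0;    // assumed per-step cost with overlap
  const double MispredictPenalty = 15.0;  // assumed misprediction penalty
  const double MispredictRate = 0.4;      // assumed; RBTree descent predicts poorly

  double CmovCost = ChainCycles;
  double BranchCost = OverlappedCycles + MispredictRate * MispredictPenalty;
  std::printf("cmov: %.1f cycles/step, branch: %.1f cycles/step\n",
              CmovCost, BranchCost);      // with these numbers the branch wins
  return 0;
}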
We have observed several internal applications that benefit from this optimization; all of them traverse tree-like data structures.
Before if-conversion, this patch computes the length of the dependence chain feeding the compare (only the part that could otherwise execute in parallel with the code after the branch); if that length exceeds a threshold, if-conversion is abandoned.
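A minimal, self-contained sketch of that heuristic (this is not the actual LLVM if-conversion code; the instruction representation, the latencies, and the threshold are all assumptions for illustration):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Toy instruction DAG: each instruction has a latency and the indices of the
// instructions that define its operands.
struct Instr {
  unsigned Latency;
  std::vector<size_t> Deps;
};

// Depth, in cycles, of the dependence chain ending at instruction I.
static unsigned chainDepth(const std::vector<Instr> &Instrs, size_t I,
                           std::vector<unsigned> &Memo) {
  if (Memo[I] != 0)
    return Memo[I];
  unsigned Depth = 0;
  for (size_t D : Instrs[I].Deps)
    Depth = std::max(Depth, chainDepth(Instrs, D, Memo));
  Memo[I] = Depth + Instrs[I].Latency;
  return Memo[I];
}

// Give up on if-conversion when the chain feeding the condition is too long.
// The threshold here is an arbitrary placeholder, not the value in the patch.
static bool shouldIfConvert(const std::vector<Instr> &Instrs, size_t CondIdx) {
  const unsigned Threshold = 10;
  std::vector<unsigned> Memo(Instrs.size(), 0);
  return chainDepth(Instrs, CondIdx, Memo) <= Threshold;
}

int main() {
  // The chain from the motivating loop: load the key pointer, load the key,
  // compare against the search key (the latencies are assumed values).
  std::vector<Instr> Instrs = {
      {5, {}},   // 0: movq 32(%rbx), %rcx   load key pointer
      {5, {0}},  // 1: movl (%rcx), %ecx     load key value
      {1, {1}},  // 2: cmpl %ecx, %eax       compare
  };
  std::printf("if-convert? %s\n", shouldIfConvert(Instrs, 2) ? "yes" : "no");
  return 0;
}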
With this patch, I now get:
// -O2
.LBB1_3:
        movq    %rdx, %rbx
        movq    32(%rbx), %rcx
        movl    (%rcx), %ecx
        cmpl    %ecx, %eax
        jge     .LBB1_5
BB#4:
        leaq    16(%rbx), %rdx
        jmp     .LBB1_6
        .p2align        4, 0x90
.LBB1_5:
        leaq    24(%rbx), %rdx
.LBB1_6:
        movq    (%rdx), %rdx
        testq   %rdx, %rdx
        jne     .LBB1_3
// -O3
.LBB1_3:
        movq    %rdx, %rbx
        movq    32(%rbx), %rcx
        movl    (%rcx), %ecx
        cmpl    %ecx, %eax
        jge     .LBB1_5
BB#4:
        leaq    16(%rbx), %rdx
        movq    (%rdx), %rdx
        testq   %rdx, %rdx
        jne     .LBB1_3
        jmp     .LBB1_7
        .p2align        4, 0x90
.LBB1_5:
        leaq    24(%rbx), %rdx
        movq    (%rdx), %rdx
        testq   %rdx, %rdx
        jne     .LBB1_3
This is still worse than the GCC-generated code, but the remaining differences (code layout, the extra lea) are separate issues, and it is already faster than the cmov version.
Use uppercase variable names.
Shouldn't size be unsigned?