This is an archive of the discontinued LLVM Phabricator instance.

Value: Provide a shortcut in stripPointerCastsAndOffsets()
AbandonedPublic

Authored by MatzeB on Jan 26 2016, 8:15 PM.

Details

Summary

The majority of stripPointerCastsAndOffsets() queries can be resolved
after checking 1 or 2 values. This patch delays the creation and
insertion into a SmallPtrSet until we have to check more than that. This
slightly improves compile time in my benchmarks.

Diff Detail

Repository
rL LLVM

Event Timeline

MatzeB updated this revision to Diff 46096.Jan 26 2016, 8:15 PM
MatzeB retitled this revision from to Value: Provide a shortcut in stripPointerCastsAndOffsets().
MatzeB updated this object.
MatzeB added reviewers: chandlerc, sunfish, baldrick.
MatzeB set the repository for this revision to rL LLVM.
MatzeB added a subscriber: llvm-commits.
chandlerc edited edge metadata.Jan 26 2016, 8:24 PM

Ugh. It's pretty sad that the optimizer doesn't fix this.

lib/IR/Value.cpp
424

Maybe use a lambda instead?

Ugh. It's pretty sad that the optimizer doesn't fix this.

I recall also seeing this problem recently in a different context, and it looked like the reason why this mattered was because the constructor ends up zeroing the stack array (instead of leaving it uninitialized).

If we can fix the class implementation, or the optimizer, that would be better. It looks like there are two problems here, one is the memset (which is what I recall seeing in the profile), and the other is that we don't inline the destructor.

Specifically:

$ cat /tmp/q1.cpp 
#include <llvm/ADT/SmallPtrSet.h>
#include <llvm/IR/Type.h>
using namespace llvm;

bool foo(SmallPtrSetImpl<Type*> *Visited);

bool isSized() {
  SmallPtrSet<Type *, 4> V;
  return foo(&V);
}

$ clang++ -std=gnu++11 -O3 -S -emit-llvm -Iinclude -I/build/ppc64/llvm/include --gcc-toolchain=/install/ppc64/gcc -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -o - /tmp/q1.cpp -fno-exceptions
...
define zeroext i1 @_Z7isSizedv() #0 {
entry:
  %V = alloca %"class.llvm::SmallPtrSet", align 8
  %0 = bitcast %"class.llvm::SmallPtrSet"* %V to i8*
  call void @llvm.lifetime.start(i64 64, i8* %0) #3
  %arraydecay.i = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 1, i64 0
  %SmallArray.i.i.i = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 0, i32 0, i32 0
  store i8** %arraydecay.i, i8*** %SmallArray.i.i.i, align 8, !tbaa !1
  %1 = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 0, i32 0, i32 1
  store i8** %arraydecay.i, i8*** %1, align 8, !tbaa !7
  %2 = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 0, i32 0, i32 2
  store i32 4, i32* %2, align 8, !tbaa !8
  %3 = bitcast i8** %arraydecay.i to i8*
  %4 = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 0, i32 0, i32 3
  call void @llvm.memset.p0i8.i64(i8* %3, i8 -1, i64 32, i32 8, i1 false) #3
  store i32 0, i32* %4, align 4, !tbaa !9
  %5 = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 0, i32 0, i32 4
  store i32 0, i32* %5, align 8, !tbaa !10
  %6 = bitcast %"class.llvm::SmallPtrSet"* %V to %"class.llvm::SmallPtrSetImpl"*
  %call = call zeroext i1 @_Z3fooPN4llvm15SmallPtrSetImplIPNS_4TypeEEE(%"class.llvm::SmallPtrSetImpl"* %6) #3
  %7 = bitcast %"class.llvm::SmallPtrSet"* %V to %"class.llvm::SmallPtrSetImplBase"*
  call void @_ZN4llvm19SmallPtrSetImplBaseD2Ev(%"class.llvm::SmallPtrSetImplBase"* nonnull %7) #3
  call void @llvm.lifetime.end(i64 64, i8* %0) #3
  ret i1 %call
}
...

Consider, on the other hand, what happens with a SmallVector of the same type:

$ cat /tmp/q2.cpp 
#include <llvm/ADT/SmallVector.h>
#include <llvm/IR/Type.h>
using namespace llvm;

bool foo(SmallVectorImpl<Type*> *Visited);

bool isSized() {
  SmallVector<Type *, 4> V;
  return foo(&V);
}

$ clang ... /tmp/q2.cpp ...
...
define zeroext i1 @_Z7isSizedv() #0 {
entry:
  %V = alloca %"class.llvm::SmallVector", align 8
  %0 = bitcast %"class.llvm::SmallVector"* %V to i8*
  call void @llvm.lifetime.start(i64 56, i8* %0) #3
  %1 = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i64 0
  %BeginX.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0
  store i8* %1, i8** %BeginX.i.i.i.i.i, align 8, !tbaa !1
  %EndX.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
  store i8* %1, i8** %EndX.i.i.i.i.i, align 8, !tbaa !6
  %CapacityX.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0, i32 0, i32 0, i32 0, i32 2
  %add.ptr.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i64 32
  store i8* %add.ptr.i.i.i.i.i, i8** %CapacityX.i.i.i.i.i, align 8, !tbaa !7
  %2 = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0
  %call = call zeroext i1 @_Z3fooPN4llvm15SmallVectorImplIPNS_4TypeEEE(%"class.llvm::SmallVectorImpl"* %2) #3
  %3 = load i8*, i8** %BeginX.i.i.i.i.i, align 8, !tbaa !1
  %cmp.i.i = icmp eq i8* %3, %1
  br i1 %cmp.i.i, label %_ZN4llvm15SmallVectorImplIPNS_4TypeEED2Ev.exit, label %if.then.i
 
if.then.i:                                        ; preds = %entry
  call void @free(i8* %3) #3
  br label %_ZN4llvm15SmallVectorImplIPNS_4TypeEED2Ev.exit
 
_ZN4llvm15SmallVectorImplIPNS_4TypeEED2Ev.exit:   ; preds = %entry, %if.then.i
  call void @llvm.lifetime.end(i64 56, i8* %0) #3
  ret i1 %call
}  
...

Here we have no memset and the destructor is inlined.

MatzeB abandoned this revision.Jan 27 2016, 10:14 PM

I just posted patches that

Ugh. It's pretty sad that the optimizer doesn't fix this.

I recall also seeing this problem recently in a different context, and it looked like the reason why this mattered was because the constructor ends up zeroing the stack array (instead of leaving it uninitialized).

If we can fix the class implementation, or the optimizer, that would be better. It looks like there are two problems here, one is the memset (which is what I recall seeing in the profile), and the other is that we don't inline the destructor.

Specifically:

$ cat /tmp/q1.cpp 
#include <llvm/ADT/SmallPtrSet.h>
#include <llvm/IR/Type.h>
using namespace llvm;

bool foo(SmallPtrSetImpl<Type*> *Visited);

bool isSized() {
  SmallPtrSet<Type *, 4> V;
  return foo(&V);
}

$ clang++ -std=gnu++11 -O3 -S -emit-llvm -Iinclude -I/build/ppc64/llvm/include --gcc-toolchain=/install/ppc64/gcc -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -o - /tmp/q1.cpp -fno-exceptions
...
define zeroext i1 @_Z7isSizedv() #0 {
entry:
  %V = alloca %"class.llvm::SmallPtrSet", align 8
  %0 = bitcast %"class.llvm::SmallPtrSet"* %V to i8*
  call void @llvm.lifetime.start(i64 64, i8* %0) #3
  %arraydecay.i = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 1, i64 0
  %SmallArray.i.i.i = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 0, i32 0, i32 0
  store i8** %arraydecay.i, i8*** %SmallArray.i.i.i, align 8, !tbaa !1
  %1 = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 0, i32 0, i32 1
  store i8** %arraydecay.i, i8*** %1, align 8, !tbaa !7
  %2 = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 0, i32 0, i32 2
  store i32 4, i32* %2, align 8, !tbaa !8
  %3 = bitcast i8** %arraydecay.i to i8*
  %4 = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 0, i32 0, i32 3
  call void @llvm.memset.p0i8.i64(i8* %3, i8 -1, i64 32, i32 8, i1 false) #3
  store i32 0, i32* %4, align 4, !tbaa !9
  %5 = getelementptr inbounds %"class.llvm::SmallPtrSet", %"class.llvm::SmallPtrSet"* %V, i64 0, i32 0, i32 0, i32 4
  store i32 0, i32* %5, align 8, !tbaa !10
  %6 = bitcast %"class.llvm::SmallPtrSet"* %V to %"class.llvm::SmallPtrSetImpl"*
  %call = call zeroext i1 @_Z3fooPN4llvm15SmallPtrSetImplIPNS_4TypeEEE(%"class.llvm::SmallPtrSetImpl"* %6) #3
  %7 = bitcast %"class.llvm::SmallPtrSet"* %V to %"class.llvm::SmallPtrSetImplBase"*
  call void @_ZN4llvm19SmallPtrSetImplBaseD2Ev(%"class.llvm::SmallPtrSetImplBase"* nonnull %7) #3
  call void @llvm.lifetime.end(i64 64, i8* %0) #3
  ret i1 %call
}
...

Consider, on the other hand, what happens with a SmallVector of the same type:

$ cat /tmp/q2.cpp 
#include <llvm/ADT/SmallVector.h>
#include <llvm/IR/Type.h>
using namespace llvm;

bool foo(SmallVectorImpl<Type*> *Visited);

bool isSized() {
  SmallVector<Type *, 4> V;
  return foo(&V);
}

$ clang ... /tmp/q2.cpp ...
...
define zeroext i1 @_Z7isSizedv() #0 {
entry:
  %V = alloca %"class.llvm::SmallVector", align 8
  %0 = bitcast %"class.llvm::SmallVector"* %V to i8*
  call void @llvm.lifetime.start(i64 56, i8* %0) #3
  %1 = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i64 0
  %BeginX.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0
  store i8* %1, i8** %BeginX.i.i.i.i.i, align 8, !tbaa !1
  %EndX.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
  store i8* %1, i8** %EndX.i.i.i.i.i, align 8, !tbaa !6
  %CapacityX.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0, i32 0, i32 0, i32 0, i32 2
  %add.ptr.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i64 32
  store i8* %add.ptr.i.i.i.i.i, i8** %CapacityX.i.i.i.i.i, align 8, !tbaa !7
  %2 = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %V, i64 0, i32 0
  %call = call zeroext i1 @_Z3fooPN4llvm15SmallVectorImplIPNS_4TypeEEE(%"class.llvm::SmallVectorImpl"* %2) #3
  %3 = load i8*, i8** %BeginX.i.i.i.i.i, align 8, !tbaa !1
  %cmp.i.i = icmp eq i8* %3, %1
  br i1 %cmp.i.i, label %_ZN4llvm15SmallVectorImplIPNS_4TypeEED2Ev.exit, label %if.then.i
 
if.then.i:                                        ; preds = %entry
  call void @free(i8* %3) #3
  br label %_ZN4llvm15SmallVectorImplIPNS_4TypeEED2Ev.exit
 
_ZN4llvm15SmallVectorImplIPNS_4TypeEED2Ev.exit:   ; preds = %entry, %if.then.i
  call void @llvm.lifetime.end(i64 56, i8* %0) #3
  ret i1 %call
}  
...

Here we have no memset and the destructor is inlined.

I just posted reviews that should avoid the memset in the small SmallPtrSet cases. I can't measure any signifant differences with it, so we can abandon this (assuming the SmallPtrSet changes get approved).

I just posted patches that

...

I just posted reviews that should avoid the memset in the small SmallPtrSet cases. I can't measure any signifant differences with it, so we can abandon this (assuming the SmallPtrSet changes get approved).

Great, thanks!