Commit a108a65 (committed May 1, 2014)
Add an optimization that does CSE in a group of similar GEPs.
This optimization merges the common part of a group of GEPs, so that each pointer address can then be computed by adding a simple offset to that common part. The optimization is currently enabled only for the NVPTX backend, where it has a large payoff on some benchmarks.

Review: http://reviews.llvm.org/D3462

Patch by Jingyue Wu.

llvm-svn: 207783
Parent: 748be6c
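
To illustrate the rewrite (a minimal sketch distilled from the new tests below; @A and the value names are illustrative, not taken from the patch): two GEPs that differ only by a constant in one index share no common subexpression, so nothing CSEs. The pass separates the constant offset out, after which GVN or EarlyCSE can reuse the common base:

  @A = global [32 x [32 x float]] zeroinitializer

  ; Before: %p0 and %p1 are distinct GEPs with nothing to reuse.
  ;   %p0 = getelementptr inbounds [32 x [32 x float]]* @A, i64 0, i64 %i, i64 %j
  ;   %j1 = add i64 %j, 1
  ;   %p1 = getelementptr inbounds [32 x [32 x float]]* @A, i64 0, i64 %i, i64 %j1
  ;
  ; After -separate-const-offset-from-gep (plus -gvn -dce), both addresses hang
  ; off one common base, and %p1 becomes that base plus a 4-byte offset:
  ;   %base   = getelementptr [32 x [32 x float]]* @A, i64 0, i64 %i, i64 %j
  ;   %base.i = ptrtoint float* %base to i64
  ;   %p1.i   = add i64 %base.i, 4    ; one float = 4 bytes
  ;   %p1     = inttoptr i64 %p1.i to float*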

File tree: 9 files changed, +774 −4 lines changed

llvm/include/llvm/InitializePasses.h (+1)

@@ -238,6 +238,7 @@ void initializeSimpleInlinerPass(PassRegistry&);
 void initializeRegisterCoalescerPass(PassRegistry&);
 void initializeSingleLoopExtractorPass(PassRegistry&);
 void initializeSinkingPass(PassRegistry&);
+void initializeSeparateConstOffsetFromGEPPass(PassRegistry &);
 void initializeSlotIndexesPass(PassRegistry&);
 void initializeSpillPlacementPass(PassRegistry&);
 void initializeStackProtectorPass(PassRegistry&);

llvm/include/llvm/LinkAllPasses.h (+1)

@@ -156,6 +156,7 @@ namespace {
       (void) llvm::createBBVectorizePass();
       (void) llvm::createPartiallyInlineLibCallsPass();
       (void) llvm::createScalarizerPass();
+      (void) llvm::createSeparateConstOffsetFromGEPPass();

       (void)new llvm::IntervalPartition();
       (void)new llvm::FindUsedTypes();

llvm/include/llvm/Transforms/Scalar.h (+6)

@@ -377,6 +377,12 @@ FunctionPass *createScalarizerPass();
 // AddDiscriminators - Add DWARF path discriminators to the IR.
 FunctionPass *createAddDiscriminatorsPass();

+//===----------------------------------------------------------------------===//
+//
+// SeparateConstOffsetFromGEP - Split GEPs for better CSE
+//
+FunctionPass *createSeparateConstOffsetFromGEPPass();
+
 } // End llvm namespace

 #endif

llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp (+17 −4)

@@ -147,10 +147,23 @@ void NVPTXPassConfig::addIRPasses() {
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
   addPass(createNVPTXFavorNonGenericAddrSpacesPass());
-  // The FavorNonGenericAddrSpaces pass may remove instructions and leave some
-  // values unused. Therefore, we run a DCE pass right afterwards. We could
-  // remove unused values in an ad-hoc manner, but it requires manual work and
-  // might be error-prone.
+  addPass(createSeparateConstOffsetFromGEPPass());
+  // The SeparateConstOffsetFromGEP pass creates variadic bases that can be
+  // shared by multiple GEPs; run GVN or EarlyCSE afterwards to reuse them. GVN
+  // generates significantly better code than EarlyCSE for some benchmarks.
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+  // Both FavorNonGenericAddrSpaces and SeparateConstOffsetFromGEP may leave
+  // some dead code. We could remove dead code in an ad-hoc manner, but that
+  // requires manual work and might be error-prone.
+  //
+  // The FavorNonGenericAddrSpaces pass shortcuts unnecessary addrspacecasts
+  // and leaves them unused.
+  //
+  // SeparateConstOffsetFromGEP rebuilds a new index from the old one, so the
+  // old index and some of its intermediate results may become unused.
   addPass(createDeadCodeEliminationPass());
 }
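
This pipeline can also be exercised standalone: the third RUN line of the end-to-end test below runs opt -separate-const-offset-from-gep -gvn -dce on the input IR, matching the pass order added here.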

llvm/lib/Transforms/Scalar/Scalar.cpp (+1)

@@ -64,6 +64,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeStructurizeCFGPass(Registry);
   initializeSinkingPass(Registry);
   initializeTailCallElimPass(Registry);
+  initializeSeparateConstOffsetFromGEPPass(Registry);
 }

 void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {

llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp (+583)

(Large diffs are not rendered by default; the new pass's implementation is not shown in this view.)
New file (+4), presumably the test directory's lit.local.cfg: it marks the tests below unsupported unless the NVPTX target is built.

targets = set(config.root.targets_to_build.split())
if not 'NVPTX' in targets:
    config.unsupported = True
New file (+60): an end-to-end test.

; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX
; RUN: opt < %s -S -separate-const-offset-from-gep -gvn -dce | FileCheck %s --check-prefix=IR

; Verifies the SeparateConstOffsetFromGEP pass.
; The following code computes
;   *output = array[x][y] + array[x][y+1] + array[x+1][y] + array[x+1][y+1]
;
; We expect SeparateConstOffsetFromGEP to transform it to
;
;   float *base = &array[x][y];
;   *output = base[0] + base[1] + base[32] + base[33];
;
; so the backend can emit PTX that uses fewer virtual registers.

target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-unknown-unknown"

@array = internal addrspace(3) constant [32 x [32 x float]] zeroinitializer, align 4

define void @sum_of_array(i32 %x, i32 %y, float* nocapture %output) {
.preheader:
  %0 = zext i32 %y to i64
  %1 = zext i32 %x to i64
  %2 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0
  %3 = addrspacecast float addrspace(3)* %2 to float*
  %4 = load float* %3, align 4
  %5 = fadd float %4, 0.000000e+00
  %6 = add i32 %y, 1
  %7 = zext i32 %6 to i64
  %8 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %7
  %9 = addrspacecast float addrspace(3)* %8 to float*
  %10 = load float* %9, align 4
  %11 = fadd float %5, %10
  %12 = add i32 %x, 1
  %13 = zext i32 %12 to i64
  %14 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %0
  %15 = addrspacecast float addrspace(3)* %14 to float*
  %16 = load float* %15, align 4
  %17 = fadd float %11, %16
  %18 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %7
  %19 = addrspacecast float addrspace(3)* %18 to float*
  %20 = load float* %19, align 4
  %21 = fadd float %17, %20
  store float %21, float* %output, align 4
  ret void
}

; PTX-LABEL: sum_of_array(
; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rl|r)[0-9]+]]{{\]}}
; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}

; IR-LABEL: @sum_of_array(
; IR: [[BASE_PTR:%[0-9]+]] = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i32 %x, i32 %y
; IR: [[BASE_INT:%[0-9]+]] = ptrtoint float addrspace(3)* [[BASE_PTR]] to i64
; IR: %5 = add i64 [[BASE_INT]], 4
; IR: %10 = add i64 [[BASE_INT]], 128
; IR: %15 = add i64 [[BASE_INT]], 132
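
The arithmetic behind the checked immediates: each row of @array holds 32 floats, i.e. 32 * 4 = 128 bytes, so base+4 is array[x][y+1], base+128 is array[x+1][y], and base+132 is array[x+1][y+1], matching both the PTX and the IR checks.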
New file (+101): unit tests for the transformation.

; RUN: opt < %s -separate-const-offset-from-gep -dce -S | FileCheck %s

; Several unit tests for -separate-const-offset-from-gep. The transformation
; heavily relies on TargetTransformInfo, so we put these tests under
; target-specific folders.

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; target triple is necessary; otherwise TargetTransformInfo rejects any
; addressing mode.
target triple = "nvptx64-unknown-unknown"

%struct.S = type { float, double }

@struct_array = global [1024 x %struct.S] zeroinitializer, align 16
@float_2d_array = global [32 x [32 x float]] zeroinitializer, align 4

; We should not extract any struct field indices, because fields in a struct
; may have different types.
define double* @struct(i32 %i) {
entry:
  %add = add nsw i32 %i, 5
  %idxprom = sext i32 %add to i64
  %p = getelementptr inbounds [1024 x %struct.S]* @struct_array, i64 0, i64 %idxprom, i32 1
  ret double* %p
}
; CHECK-LABEL: @struct
; CHECK: getelementptr [1024 x %struct.S]* @struct_array, i64 0, i32 %i, i32 1

; We should be able to trace into sext/zext if it's directly used as a GEP
; index.
define float* @sext_zext(i32 %i, i32 %j) {
entry:
  %i1 = add i32 %i, 1
  %j2 = add i32 %j, 2
  %i1.ext = sext i32 %i1 to i64
  %j2.ext = zext i32 %j2 to i64
  %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i1.ext, i64 %j2.ext
  ret float* %p
}
; CHECK-LABEL: @sext_zext
; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i32 %i, i32 %j
; CHECK: add i64 %{{[0-9]+}}, 136
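; The extracted offset above: 136 = 1*128 + 2*4, i.e. the +1 in the row index
; is one 32-float row (128 bytes) and the +2 in the column index is two floats.
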
; We should be able to trace into sext/zext if it can be distributed to both
; operands, e.g., sext (add nsw a, b) == add nsw (sext a), (sext b)
define float* @ext_add_no_overflow(i64 %a, i32 %b, i64 %c, i32 %d) {
  %b1 = add nsw i32 %b, 1
  %b2 = sext i32 %b1 to i64
  %i = add i64 %a, %b2
  %d1 = add nuw i32 %d, 1
  %d2 = zext i32 %d1 to i64
  %j = add i64 %c, %d2
  %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %j
  ret float* %p
}
; CHECK-LABEL: @ext_add_no_overflow
; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}
; CHECK: [[BASE_INT:%[0-9]+]] = ptrtoint float* [[BASE_PTR]] to i64
; CHECK: add i64 [[BASE_INT]], 132
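; Here 132 = 1*128 + 1*4: the +1 guarded by nsw distributes through the sext
; into the row index (one row), and the +1 guarded by nuw distributes through
; the zext into the column index (one float).
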
; We should treat "or" with no common bits (%k) as "add", and leave "or" with
; potentially common bits (%l) as is.
define float* @or(i64 %i) {
entry:
  %j = shl i64 %i, 2
  %k = or i64 %j, 3 ; no common bits
  %l = or i64 %j, 4 ; potentially common bits
  %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %k, i64 %l
  ret float* %p
}
; CHECK-LABEL: @or
; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %j, i64 %l
; CHECK: add i64 %{{[0-9]+}}, 384
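; Here 384 = 3*128: %j = %i << 2 has its low two bits clear, so %j | 3 equals
; %j + 3 and the 3 can be extracted from the row index; bit 2 of %j may be set,
; so %j | 4 stays an "or".
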
; The subexpression (b + 5) is used in both "i = a + (b + 5)" and "*out = b +
; 5". When extracting the constant offset 5, make sure "*out = b + 5" isn't
; affected.
define float* @expr(i64 %a, i64 %b, i64* %out) {
entry:
  %b5 = add i64 %b, 5
  %i = add i64 %b5, %a
  %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 0
  store i64 %b5, i64* %out
  ret float* %p
}
; CHECK-LABEL: @expr
; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %0, i64 0
; CHECK: add i64 %{{[0-9]+}}, 640
; CHECK: store i64 %b5, i64* %out
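; Here 640 = 5*128: the 5 extracted from the row index is five rows, while the
; separate use of %b5 in the store is left untouched.
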
; Verifies we handle "sub" correctly.
define float* @sub(i64 %i, i64 %j) {
  %i2 = sub i64 %i, 5 ; i - 5
  %j2 = sub i64 5, %j ; 5 - j
  %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i2, i64 %j2
  ret float* %p
}
; CHECK-LABEL: @sub
; CHECK: %[[j2:[0-9]+]] = sub i64 0, %j
; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %[[j2]]
; CHECK: add i64 %{{[0-9]+}}, -620
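; Here -620 = (-5)*128 + 5*4: the -5 in the row index (%i - 5) contributes
; -640 bytes, and the +5 in the column index (5 - %j, rewritten as (0 - %j) + 5)
; contributes +20 bytes.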
