This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contentst

-
polly/trunk/
-
trunk/
-
lib/Transform/
-
Transform/
-
ScheduleOptimizer.cpp
-
test/ScheduleOptimizer/
-
ScheduleOptimizer/
-
mat_mul_pattern_data_layout.ll

Differential D27878

[Polly] Use three-dimensional arrays to store packed operands of the matrix multiplication
ClosedPublic

Authored by gareevroman on Dec 17 2016, 8:04 AM.

Download Raw Diff

Details

Reviewers

Meinersbur
grosser
jdoerfert

Commits

rG92c446016a72: [Polly] Use three-dimensional arrays to store packed operands of the matrix…
rPLO290251: [Polly] Use three-dimensional arrays to store packed operands of the matrix
rL290251: [Polly] Use three-dimensional arrays to store packed operands of the matrix

Summary

Previously we had two-dimensional accesses to store packed operands of the matrix multiplication for the sake of simplicity of the packed arrays. However, addition of the third dimension helps to simplify the corresponding memory access, reduce the execution time of isl operations applied to it, and consequently reduce the compile-time of Polly. For example, in case of Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME -march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true -DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8 -mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm -polly-target-latency-vector-fma=7

it helps to reduce the compile-time from about 361.456 seconds to about 0.816 seconds.

Diff Detail

Repository: rL LLVM

Event Timeline

gareevroman updated this revision to Diff 81843.Dec 17 2016, 8:04 AM

gareevroman retitled this revision from to [Polly] Compute the lexicographic minimum in Memory::applyScheduleToAccessRelation, when the schedule is applied.

gareevroman updated this object.

gareevroman added reviewers: grosser, Meinersbur, jdoerfert.

gareevroman added a subscriber: pollydev.

For example, getAddressFunction() can return the following map . Subsequently, Memory::applyScheduleToAccessRelation tries to apply the schedule to its domain.

{ CopyStmt_1[i0, i1, i2] -> Packed_A[o0, o1] : (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((-33i0 + 44*floor((143i0)/144))/28), e3 = floor((-99i0 + 442*floor((143i0)/144))/275), e4 = floor((1 + i0)/3), e5 = floor((-165 + 1061i0 - 858*floor((143i0)/144))/170), e6 = floor((2 + 2i0)/3), e7 = floor((1734i0 + i2 - 1632*floor((143i0)/144))/170), e8 = floor((2i0)/3), e9 = floor((-5 + 51i0 - 48*floor((143i0)/144))/10): 5e1 = -6i0 + 8e0 and 5e2 = -11i0 + 13e0 and 5e3 = -5 + 32i0 - 26e0 and 5e4 = -5 + 97i0 - 96e0 and 5e5 = -5 + 27i0 - 21e0 and 5e6 = 51i0 - 48e0 and 170e7 = 8160 - 7990i0 + i2 - o0 + 8160e0 and 5e8 = -5 + 51i0 - 48e0 and 3e9 = i0 - o1 and 144e0 <= -5 + 143i0 and 144e0 <= -15 + 143i0 + 10o1 and 144e0 >= -30 + 143i0 + 10o1 and 9792e0 <= -8160 + 9724i0 + o0 and 144e0 >= -10 + 143i0 and 24480e0 <= -24481 + 24310i0 + 3o0 and 9792e0 >= -8329 + 9724i0 + o0)) or (exists (e0 = floor((3i2)/170), e1 = floor((-i2 + o0)/170), e2 = floor((-115i0 + o1)/144): 170e1 = -i2 + o0 and 144e2 = -115i0 + o1 and o1 > 0 and 170e0 >= 24480 + 3i2 - 3o0 - 1020o1 and 170e0 <= 24650 + 3i2 - 3o0 - 1020o1 and 340o1 >= 8160 - o0 and o1 <= 2 and 170e0 >= -169 + 3i2 and 340o1 <= 8329 - o0 and 850o1 <= 24480 - 3o0 and 170e0 <= 3i2)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((i0)/144): o1 = 0 and 170e0 = -i2 + o0 and 144e1 = i0 and o0 >= 0 and o0 <= 169)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-129 + i0)/144): o1 = 0 and 170e0 = -i2 + o0 and 144e1 = -129 + i0 and o0 <= 7479 and o0 >= 7310)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-111 + i0)/144): o1 = 0 and 170e0 = -i2 + o0 and 144e1 = -111 + i0 and o0 <= 6459 and o0 >= 6290)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-116 + i0)/144): o1 = 2 and 170e0 = -i2 + o0 and 144e1 = -116 + i0 and o0 >= 6574 and o0 <= 6629)) or (exists (e0 = floor((3i2)/170), e1 = floor((-116 + i0)/144), e2 = floor((-i2 + o0)/170), e3 = floor((-2 + o1)/3): 144e1 = -116 + i0 and 170e2 = -i2 + o0 and 3e3 = -2 + o1 and o0 <= 6573 and 170e0 <= 3i2 and o0 >= 6460 and 170e0 >= -169 + 3i2 and 170e0 <= 20145 + 3i2 - 3o0 - 170o1 and 170e0 >= 19720 + 3i2 - 3o0 - 170o1)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-93 + i0)/144): o1 = 0 and 170e0 = -i2 + o0 and 144e1 = -93 + i0 and o0 <= 5439 and o0 >= 5270)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((4 - 11i0 + 24*floor((143i0)/144))/21), e3 = floor((-6 - 33i0 + 44*floor((143i0)/144))/28), e4 = floor((-7 - 22i0 + 91*floor((143i0)/144))/55), e5 = floor((-102 + 1263i0 - 1054*floor((143i0)/144))/175), e6 = floor((-42 + 463i0 - 364*floor((143i0)/144))/85), e7 = floor((-81 + 59i0 + 108*floor((143i0)/144))/140), e8 = floor((-12 - 5i0 + 36*floor((143i0)/144))/27), e9 = floor((1 + i0)/3), e10 = floor((-57 - 717i0 + 836*floor((143i0)/144))/100), e11 = floor((2 + 2i0)/3), e12 = floor((34 + 1734i0 + i2 - 1632*floor((143i0)/144))/170), e13 = floor((2i0)/3), e14 = floor((-4 + 51i0 - 48*floor((143i0)/144))/10): 5e1 = -1 - 6i0 + 8e0 and 5e2 = -4 + 11i0 - 8e0 and 5e3 = -1 - 11i0 + 13e0 and 5e4 = -3 + 37i0 - 31e0 and 5e5 = -3 + 32i0 - 26e0 and 5e6 = -3 + 2i0 + 4e0 and 5e7 = -3 - 3i0 + 9e0 and 5e8 = -3 - 38i0 + 44e0 and 5e9 = -3 + 97i0 - 96e0 and 5e10 = -3 - 43i0 + 49e0 and 5e11 = 1 + 51i0 - 48e0 and 170e12 = 8160 - 7990i0 + i2 - o0 + 8160e0 and 5e13 = -4 + 51i0 - 48e0 and 3e14 = i0 - o1 and 144e0 >= -7 + 143i0 and 24480e0 <= -24481 + 24310i0 + 3o0 and 9792e0 >= -8295 + 9724i0 + o0 and 9792e0 <= -8126 + 9724i0 + o0 and 144e0 <= -2 + 143i0 and 144e0 >= -27 + 143i0 + 10o1 and 144e0 <= -12 + 143i0 + 10o1)) or (exists (e0 = floor((3i2)/170), e1 = floor((-i2 + o0)/170), e2 = floor((-87 - 115i0 + o1)/144): 170e1 = -i2 + o0 and 144e2 = -87 - 115i0 + o1 and o1 <= 2 and 850o1 <= 24990 - 3o0 and 170e0 <= 3i2 and 340o1 <= 8499 - o0 and o1 > 0 and 340o1 >= 8330 - o0 and 170e0 >= -169 + 3i2 and 170e0 >= 24990 + 3i2 - 3o0 - 1020o1 and 170e0 <= 25160 + 3i2 - 3o0 - 1020o1)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-118 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -118 + i0 and o0 >= 6687 and o0 <= 6799)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-118 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -118 + i0 and o0 >= 6630 and o0 <= 6686)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-123 + i0)/144): o1 = 0 and 170e0 = -i2 + o0 and 144e1 = -123 + i0 and o0 <= 7139 and o0 >= 6970)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-101 + i0)/144): o1 = 2 and 170e0 = -i2 + o0 and 144e1 = -101 + i0 and o0 <= 5779 and o0 >= 5724)) or (exists (e0 = floor((-101 + i0)/144), e1 = floor((-i2 + o0)/170), e2 = floor((-2 + o1)/3): 144e0 = -101 + i0 and 170e1 = -i2 + o0 and 3e2 = -2 + o1 and o1 >= 2 and o0 <= 5723 and o0 >= 5610 and o1 <= 3)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-117 + i0)/144): o1 = 0 and 170e0 = -i2 + o0 and 144e1 = -117 + i0 and o0 <= 6799 and o0 >= 6630)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((-449 + 2101i0 - 1560*floor((143i0)/144))/903), e3 = floor((-344 + 825i0 - 715*floor((143i0)/144))/301), e4 = floor((-350 + 1012i0 - 427*floor((143i0)/144))/473), e5 = floor((-286 + 811i0 - 442*floor((143i0)/144))/301), e6 = floor((-588 + 1179i0 - 308*floor((143i0)/144))/731), e7 = floor((-311 + 1744i0 - 468*floor((143i0)/144))/903), e8 = floor((-459 + 706i0 - 351*floor((143i0)/144))/301), e9 = floor((-476 + 893i0 - 63*floor((143i0)/144))/387), e10 = floor((-1331 + 3676i0 - 1119*floor((143i0)/144))/903), e11 = floor((-319 + 825i0 - 414*floor((143i0)/144))/301), e12 = floor((-218 + 764i0 - 407*floor((143i0)/144))/215), e13 = floor((-36 + 138i0 - 68*floor((143i0)/144))/43), e14 = floor((1 + i0)/3), e15 = floor((-56 + 229i0 - 144*floor((143i0)/144))/86), e16 = floor((2 + 2i0)/3), e17 = floor((5610 - 3230i0 + 43i2 + 8160*floor((143i0)/144))/7310), e18 = floor((2i0)/3), e19 = floor((-10 - 19i0 + 48*floor((143i0)/144))/86): 43e1 = -50 + 120i0 - 104e0 and 43e2 = -49 + 66i0 - 40e0 and 43e3 = -50 + 77i0 - 61e0 and 43e4 = -42 + 118i0 - 65e0 and 43e5 = -42 + 75i0 - 22e0 and 43e6 = -68 + 103i0 - 52e0 and 43e7 = -24 + 49i0 + 12e0 and 43e8 = -68 + 60i0 - 9e0 and 43e9 = -94 + 131i0 - 39e0 and 43e10 = -91 + 141i0 - 19e0 and 43e11 = -50 + 77i0 - 18e0 and 43e12 = -66 + 210i0 - 139e0 and 43e13 = -36 + 138i0 - 68e0 and 43e14 = 23 - 81i0 + 96e0 and 43e15 = -56 + 186i0 - 144e0 and 43e16 = 33 - 19i0 + 48e0 and 170e17 = 8160 - 7990i0 + i2 - o0 + 8160e0 and 43e18 = -10 - 19i0 + 48e0 and 3e19 = i0 - o1 and 144e0 <= 159 + 143i0 - 86o1 and 144e0 >= -56 + 143i0 and 144e0 >= 30 + 143i0 - 86o1 and 144e0 <= -13 + 143i0 and 24480e0 <= -24481 + 24310i0 + 3o0 and 342720e0 <= -345270 + 340340i0 + 43o0 and 342720e0 >= -352537 + 340340i0 + 43o0)) or (exists (e0 = floor((3i2)/170), e1 = floor((-i2 + o0)/170), e2 = floor((-9 - 67i0 + o1)/144): 170e1 = -i2 + o0 and 144e2 = -9 - 67i0 + o1 and o1 <= 2 and 170e0 >= -169 + 3i2 and 170e0 >= 7650 + 3i2 - 3o0 + 7140o1 and 170e0 <= 7820 + 3i2 - 3o0 + 7140o1 and 2380o1 <= -2550 + o0 and o1 > 0 and 2380o1 >= -2719 + o0 and 170e0 <= 3i2 and 7310o1 >= -7650 + 3o0)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-109 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -109 + i0 and o0 >= 6177 and o0 <= 6289)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-109 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -109 + i0 and o0 <= 6176 and o0 >= 6120)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-136 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -136 + i0 and o0 <= 7819 and o0 >= 7707)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-136 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -136 + i0 and o0 >= 7650 and o0 <= 7706)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21), e3 = floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11), e4 = floor((2i0 + 6*floor((143i0)/144) + 3*floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11))/11), e5 = floor((i0 + 2*floor((2i0 + 6*floor((143i0)/144) + 3*floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11))/11))/3), e6 = floor((3i0 + 2*floor((143i0)/144) + 3*floor((i0 + 2*floor((2i0 + 6*floor((143i0)/144) + 3*floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11))/11))/3))/4), e7 = floor((24479 + 24650i0 - 3o0 - 170o1)/24480), e8 = floor((2 + 2i0)/3), e9 = floor((i2)/170): 3e8 = 2i0 + o1 and 510e9 = 24650i0 + 3i2 - 3o0 - 170o1 - 24480e7 and 144e0 <= 143i0 and 144e0 >= -71 + 143i0 and 4e6 >= -3 + 3i0 + 2e0 + 3e5 and 432e7 <= 143i0 + 144e6 and 143e1 >= -142 + 56e0 and 18e1 <= -4 + 7i0 and 3e5 <= i0 + 2e4 and 21e2 >= -13 + 7i0 + 15e1 and 432e7 >= -431 + 143i0 + 144e6 and 11e4 >= -10 + 2i0 + 6e0 + 3e3 and o1 <= 2 and o1 >= 0 and 4e6 <= 3i0 + 2e0 + 3e5 and 11e3 <= 4i0 + 7e0 + 7e1 and 11e3 >= -10 + 4i0 + 7e0 + 7e1 and 24e3 >= -7 + 17i0 + 21e2 and 24480e7 >= 24650i0 - 3o0 - 170o1 and 11e4 <= 2i0 + 6e0 + 3e3 and 60e3 <= -18 + 103i0 - 60e0 + 51e2 and 3e5 >= -2 + i0 + 2e4 and 24480e7 <= 507 + 24650i0 - 3o0 - 170o1)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21), e3 = floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11), e4 = floor((10i0 + 29*floor((143i0)/144) + 29*floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11))/34), e5 = floor((17i0 + 6*floor((10i0 + 29*floor((143i0)/144) + 29*floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11))/34))/29), e6 = floor((i2)/170): 6e5 = -12 + 16i0 - o1 - 6e0 + 6e2 - 6e3 and 510e6 = 24480 - 24310i0 + 3i2 - 3o0 - 170o1 + 24480e0 and 144e0 <= 143i0 and o1 >= 0 and 144e0 >= -71 + 143i0 and 36e4 <= -180 + 362i0 - 29o1 - 174e0 + 174e2 - 174e3 and 143e1 >= -142 + 56e0 and 18e1 <= -4 + 7i0 and 6e3 <= 6 - 26i0 + 5o1 + 30e0 + 6e2 and 21e2 >= -13 + 7i0 + 15e1 and o1 <= 2 and 36e4 >= -348 + 362i0 - 29o1 - 174e0 + 174e2 - 174e3 and 11e3 <= 4i0 + 7e0 + 7e1 and 34e4 <= 10i0 + 29e0 + 29e3 and 33e2 <= 11 - 13i0 + 24e0 + 24e1 and 143e1 <= 56e0 and 11e3 >= -10 + 4i0 + 7e0 + 7e1 and 15e4 >= -116 + 88i0 + 87e2 - 87e3 and 3e4 <= 130 - 139i0 + 87e0 + 58e1 - 58e2 + 58e3 and 24e3 <= -8 + 17i0 + 21e2 and 3e4 >= 73 - 139i0 + 87e0 + 58e1 - 58e2 + 58e3 and 1716e3 <= -6 - 286i0 - 143o1 + 1380e0 + 1716e2 and 1716e3 >= -1716 - 286i0 - 143o1 + 1380e0 + 1716e2 and 24480e0 <= -24480 + 24310i0 + 3o0 + 170o1 and 24480e0 >= -24987 + 24310i0 + 3o0 + 170o1 and 34e4 >= -33 + 10i0 + 29e0 + 29e3)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21), e3 = floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11), e4 = floor((10i0 + 29*floor((143i0)/144) + 29*floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11))/34), e5 = floor((17i0 + 6*floor((10i0 + 29*floor((143i0)/144) + 29*floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11))/34))/29), e6 = floor((2 + 2i0)/3), e7 = floor((i2)/170): 3e6 = 2i0 + o1 and 510e7 = 24480 - 24310i0 + 3i2 - 3o0 - 170o1 + 24480e0 and 144e0 < 143i0 and 144e0 >= -71 + 143i0 and 29e5 <= 17i0 + 6e4 and 15e4 >= -116 + 88i0 + 87e2 - 87e3 and 143e1 >= -142 + 56e0 and 18e1 <= -4 + 7i0 and 21e2 <= 7 + 7i0 + 15e1 and 21e2 >= -13 + 7i0 + 15e1 and 11e3 >= -10 + 4i0 + 7e0 + 7e1 and 34e4 <= 10i0 + 29e0 + 29e3 and 29e5 >= -28 + 17i0 + 6e4 and 5e5 >= -8 + 9i0 + 6e2 - 6e3 and 33e2 <= 11 - 13i0 + 24e0 + 24e1 and 24480e0 >= -24987 + 24310i0 + 3o0 + 170o1 and 34e4 >= -33 + 10i0 + 29e0 + 29e3 and 143e1 <= 56e0 and 24e3 >= -28 + 17i0 + 21e2 and 24e3 <= -8 + 17i0 + 21e2 and 24480e0 <= -24480 + 24310i0 + 3o0 + 170o1 and o1 >= 0 and o1 <= 2 and 11e3 <= 4i0 + 7e0 + 7e1)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21), e3 = floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11), e4 = floor((20 + 29i0 + 3*floor((143i0)/144) + 3*floor((56*floor((143i0)/144))/143) + 9*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/105), e5 = floor((-170 + 107i0 + 34*floor((143i0)/144) + 34*floor((56*floor((143i0)/144))/143) + 102*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/175), e6 = floor((-145 + 137i0 + 174*floor((143i0)/144) + 29*floor((56*floor((143i0)/144))/143) + 87*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/170), e7 = floor((-185 + 481i0 - 33*floor((143i0)/144) - 198*floor((56*floor((143i0)/144))/143) + 396*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/435), e8 = floor((-280 + 206i0 - 28*floor((143i0)/144) - 168*floor((56*floor((143i0)/144))/143) + 336*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/145), e9 = floor((1 + i0)/3), e10 = floor((2 + 2i0)/3), e11 = floor((-34i0 + i2 + 272*floor((143i0)/144) + 272*floor((56*floor((143i0)/144))/143) - 374*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/170), e12 = floor((2i0)/3), e13 = floor((-i0 + 8*floor((143i0)/144) + 8*floor((56*floor((143i0)/144))/143) - 11*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/10): 5e3 = -5 + 3i0 + e0 + e1 + 3e2 and 5e4 = 2i0 - e0 - e1 + 2e2 and 5e5 = -5 - 2i0 + 6e0 + e1 + 3e2 and 5e6 = -10 + 7i0 - e0 - 6e1 + 12e2 and 5e7 = -10 + 9i0 - 7e0 - 17e1 + 19e2 and 5e8 = -10 + 2i0 + 4e0 - 6e1 + 12e2 and 5e9 = 5 - 7i0 + 16e0 + 16e1 - 22e2 and 5e10 = 5 - i0 + 8e0 + 8e1 - 11e2 and 170e11 = 7990 - 7990i0 + i2 - o0 + 8160e0 and 5e12 = -i0 + 8e0 + 8e1 - 11e2 and 3e13 = i0 - o1 and 144e0 <= 143i0 and 144e0 >= -71 + 143i0 and 143e1 >= -142 + 56e0 and 6e2 <= 74i0 - 72e0 + 3e1 and 321e2 >= 30 - 101i0 + 208e0 + 233e1 and 18e1 <= -4 + 7i0 and 1257e2 <= 685 - 302i0 + 576e0 + 1281e1 and 1257e2 >= 255 - 302i0 + 576e0 + 1281e1 and 321e2 <= 195 - 101i0 + 208e0 + 233e1 and 24480e0 <= -24481 + 24310i0 + 3o0 and 374e2 <= -7990 + 7956i0 + o0 - 7888e0 + 272e1 and 374e2 >= -8159 + 7956i0 + o0 - 7888e0 + 272e1 and 33e2 <= 10 - 13i0 + 24e0 + 24e1 and 33e2 >= -15 - 13i0 + 10o1 + 24e0 + 24e1 and 33e2 >= 5 - 13i0 + 24e0 + 24e1 and 33e2 <= -13i0 + 10o1 + 24e0 + 24e1)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21), e3 = floor((4i0 + 7*floor((143i0)/144) + 7*floor((56*floor((143i0)/144))/143))/11), e4 = floor((20 + 29i0 + 3*floor((143i0)/144) + 3*floor((56*floor((143i0)/144))/143) + 9*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/105), e5 = floor((-170 + 107i0 + 34*floor((143i0)/144) + 34*floor((56*floor((143i0)/144))/143) + 102*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/175), e6 = floor((-145 + 137i0 + 174*floor((143i0)/144) + 29*floor((56*floor((143i0)/144))/143) + 87*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/170), e7 = floor((-185 + 481i0 - 33*floor((143i0)/144) - 198*floor((56*floor((143i0)/144))/143) + 396*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/435), e8 = floor((-280 + 206i0 - 28*floor((143i0)/144) - 168*floor((56*floor((143i0)/144))/143) + 336*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/145), e9 = floor((1 + i0)/3), e10 = floor((2 + 2i0)/3), e11 = floor((-34i0 + i2 + 272*floor((143i0)/144) + 272*floor((56*floor((143i0)/144))/143) - 374*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/170), e12 = floor((3i2)/170), e13 = floor((2*floor((3i2)/170))/3), e14 = floor((i0 - 8*floor((143i0)/144) - 8*floor((56*floor((143i0)/144))/143) + 11*floor((7 + 7i0 + 15*floor((56*floor((143i0)/144))/143))/21))/5): 5e3 = -5 + 3i0 + e0 + e1 + 3e2 and 5e4 = 2i0 - e0 - e1 + 2e2 and 5e5 = -5 - 2i0 + 6e0 + e1 + 3e2 and 5e6 = -10 + 7i0 - e0 - 6e1 + 12e2 and 5e7 = -10 + 9i0 - 7e0 - 17e1 + 19e2 and 5e8 = -10 + 2i0 + 4e0 - 6e1 + 12e2 and 5e9 = 5 - 7i0 + 16e0 + 16e1 - 22e2 and 5e10 = 5 - i0 + 8e0 + 8e1 - 11e2 and 170e11 = 7990 - 7990i0 + i2 - o0 + 8160e0 and 85e13 = 7990 - 7956i0 + i2 - o0 + 7888e0 - 272e1 + 374e2 and 3e14 = 3 - 2i0 - o1 and 144e0 <= 143i0 and 144e0 >= -71 + 143i0 and 143e1 >= -142 + 56e0 and 1257e2 >= 255 - 302i0 + 576e0 + 1281e1 and 1257e2 <= 685 - 302i0 + 576e0 + 1281e1 and 18e1 <= -4 + 7i0 and 6e2 <= 74i0 - 72e0 + 3e1 and 170e12 >= -169 + 3i2 and 170e12 >= 23970 - 23868i0 + 3i2 - 3o0 + 23664e0 - 816e1 + 1122e2 and 374e2 <= -7990 + 7956i0 + o0 - 7888e0 + 272e1 and 321e2 >= 30 - 101i0 + 208e0 + 233e1 and 33e2 >= 5 - 13i0 + 24e0 + 24e1 and 33e2 <= 10 - 13i0 + 24e0 + 24e1 and 321e2 <= 195 - 101i0 + 208e0 + 233e1 and 170e12 <= 24140 - 23868i0 + 3i2 - 3o0 + 23664e0 - 816e1 + 1122e2 and 170e12 <= 3i2 and 24480e0 >= -24480 + 24310i0 + 3o0 and 374e2 >= -8159 + 7956i0 + o0 - 7888e0 + 272e1 and 33e2 >= 15 - 13i0 - 5o1 + 24e0 + 24e1 and 33e2 <= 21 - 13i0 - 5o1 + 24e0 + 24e1)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((-4 + 11i0)/18), e3 = floor((-110 + 55i0)/144), e4 = floor((5 + 110i0 + 36*floor((143i0)/144))/99), e5 = floor((-42 + 22i0 + 54*floor((143i0)/144))/55), e6 = floor((-140 + 196i0 + 117*floor((143i0)/144))/243), e7 = floor((-28 + 41i0)/36), e8 = floor((-19 - 16i0 + 57*floor((5 + 110i0 + 36*floor((143i0)/144))/99))/60), e9 = floor((10 + 13i0 + 27*floor((143i0)/144) + 27*floor((5 + 110i0 + 36*floor((143i0)/144))/99))/57), e10 = floor((-6 - 23i0 + 18*floor((143i0)/144) + 18*floor((5 + 110i0 + 36*floor((143i0)/144))/99))/19), e11 = floor((-5 - 32i0 + 30*floor((143i0)/144) + 15*floor((5 + 110i0 + 36*floor((143i0)/144))/99))/18), e12 = floor((1 + i0)/3), e13 = floor((2 + i0 + 2*floor((5 + 110i0 + 36*floor((143i0)/144))/99))/4), e14 = floor((2 + 2i0)/3), e15 = floor((340 + 340i0 + 3i2)/510), e16 = floor((2i0)/3), e17 = floor((-1 + i0)/3): o1 = 2 and 18e1 = -14 + 7i0 and 18e2 = -4 + 11i0 and 18e3 = -14 - 11i0 + 18e0 and 18e5 = -14 - 11i0 + 36e0 and 18e6 = -28 + 41i0 - 18e0 and 3e7 = -1 - i0 + 3e4 and 3e8 = -1 - 4i0 + 3e0 + 3e4 and 9e9 = -7 - i0 + 27e0 - 9e4 and 3e10 = -1 - 7i0 + 6e0 + 3e4 and 18e11 = -22 - 43i0 + 90e0 - 18e4 and 3e12 = 1 + i0 and 2e13 = 3i0 - 4e0 + 2e4 and 3e14 = 2 + 2i0 and 170e15 = 8160 - 7990i0 + i2 - o0 + 8160e0 and 3e16 = -1 + 2i0 and 3e17 = -2 + i0 and 2e4 <= 2 - 5i0 + 8e0 and 2e4 >= -5i0 + 8e0 and 144e0 >= -71 + 143i0 and 24480e0 <= -24481 + 24310i0 + 3o0 and 33e4 <= -2 - 11i0 + 60e0 and 36e4 >= -34 + 53i0 and 33e4 >= -17 - 11i0 + 60e0 and 144e0 <= -4 + 143i0 and 3e4 >= -53 + 64i0 - 60e0 and 36e4 <= -16 + 53i0 and 24480e0 >= -24647 + 24310i0 + 3o0)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((-4 + 11i0)/18), e3 = floor((-110 + 55i0)/144), e4 = floor((5 + 110i0 + 36*floor((143i0)/144))/99), e5 = floor((-42 + 22i0 + 54*floor((143i0)/144))/55), e6 = floor((-140 + 196i0 + 117*floor((143i0)/144))/243), e7 = floor((-28 + 41i0)/36), e8 = floor((-19 - 16i0 + 57*floor((5 + 110i0 + 36*floor((143i0)/144))/99))/60), e9 = floor((10 + 13i0 + 27*floor((143i0)/144) + 27*floor((5 + 110i0 + 36*floor((143i0)/144))/99))/57), e10 = floor((-6 - 23i0 + 18*floor((143i0)/144) + 18*floor((5 + 110i0 + 36*floor((143i0)/144))/99))/19), e11 = floor((-5 - 32i0 + 30*floor((143i0)/144) + 15*floor((5 + 110i0 + 36*floor((143i0)/144))/99))/18), e12 = floor((1 + i0)/3), e13 = floor((2 + i0 + 2*floor((5 + 110i0 + 36*floor((143i0)/144))/99))/4), e14 = floor((2 + 2i0)/3), e15 = floor((340 + 340i0 + 3i2)/510), e16 = floor((3i2)/170), e17 = floor((2*floor((3i2)/170))/3), e18 = floor((floor((2*floor((3i2)/170))/3))/2): 18e1 = -14 + 7i0 and 18e2 = -4 + 11i0 and 18e3 = -14 - 11i0 + 18e0 and 18e5 = -14 - 11i0 + 36e0 and 18e6 = -28 + 41i0 - 18e0 and 3e7 = -1 - i0 + 3e4 and 3e8 = -1 - 4i0 + 3e0 + 3e4 and 9e9 = -7 - i0 + 27e0 - 9e4 and 3e10 = -1 - 7i0 + 6e0 + 3e4 and 18e11 = -22 - 43i0 + 90e0 - 18e4 and 3e12 = 1 + i0 and 2e13 = 3i0 - 4e0 + 2e4 and 3e14 = 2 + 2i0 and 170e15 = 8160 - 7990i0 + i2 - o0 + 8160e0 and 510e18 = 24480 - 24310i0 + 3i2 - 3o0 - 170o1 + 24480e0 and 24480e0 >= -24480 + 24310i0 + 3o0 and 170e16 <= 3i2 and 144e0 >= -71 + 143i0 and 36e4 <= -16 + 53i0 and 36e4 >= -34 + 53i0 and 170e16 >= -169 + 3i2 and 3e17 <= 2e16 and 2e4 >= -5i0 + 8e0 and 3e17 >= -2 + 2e16 and 170e16 <= 24905 - 24310i0 + 3i2 - 3o0 - 170o1 + 24480e0 and 33e4 <= -2 - 11i0 + 60e0 and 24480e0 <= -24140 + 24310i0 + 3o0 and 144e0 <= -4 + 143i0 and 33e4 >= -17 - 11i0 + 60e0 and 170e16 >= 24480 - 24310i0 + 3i2 - 3o0 - 170o1 + 24480e0 and 3e4 >= -53 + 64i0 - 60e0 and 2e4 <= 2 - 5i0 + 8e0)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-92 + i0)/144): o1 = 2 and 170e0 = -i2 + o0 and 144e1 = -92 + i0 and o0 >= 5214 and o0 <= 5269)) or (exists (e0 = floor((-92 + i0)/144), e1 = floor((-i2 + o0)/170), e2 = floor((-2 + o1)/3): 144e0 = -92 + i0 and 170e1 = -i2 + o0 and 3e2 = -2 + o1 and o1 <= 3 and o0 >= 5100 and o0 <= 5213 and o1 >= 2)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-84 + i0)/144): o1 = 0 and 170e0 = -i2 + o0 and 144e1 = -84 + i0 and o0 >= 4760 and o0 <= 4929)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((-4 + 11i0)/18), e3 = floor((-103 + 55i0)/144), e4 = floor((7 + 110i0 + 36*floor((143i0)/144))/99), e5 = floor((-39 + 22i0 + 54*floor((143i0)/144))/55), e6 = floor((-130 + 196i0 + 117*floor((143i0)/144))/243), e7 = floor((2 - 17i0)/6), e8 = floor((1 + i0)/3), e9 = floor((-19 - 317i0 + 456*floor((143i0)/144))/120), e10 = floor((-3 - 49i0 + 90*floor((143i0)/144))/38), e11 = floor((-25 + 46i0 - 18*floor((143i0)/144))/27), e12 = floor((-35 - 67i0 + 180*floor((143i0)/144))/108), e13 = floor((-14 - 61i0 + 108*floor((143i0)/144))/45), e14 = floor((-7 - 47i0)/24), e15 = floor((2 + 2i0)/3), e16 = floor((170 + 340i0 + 3i2)/510), e17 = floor((2i0)/3), e18 = floor((-1 + i0)/3): o1 = 1 and 18e1 = -13 + 7i0 and 18e2 = -5 + 11i0 and 18e3 = -13 - 11i0 + 18e0 and 2e4 = 1 - 5i0 + 8e0 and 18e5 = -13 - 11i0 + 36e0 and 3e6 = 2 - 17i0 + 21e0 and 6e7 = -1 - 17i0 and 3e8 = -1 + i0 and 6e9 = -1 - 23i0 + 30e0 and 18e10 = -25 + 37i0 - 18e0 and 18e11 = -7 - 17i0 + 36e0 and 18e12 = -7 - 35i0 + 54e0 and 18e13 = -7 - 53i0 + 72e0 and 18e14 = -7 - 71i0 + 36e0 and 3e15 = 1 + 2i0 and 170e16 = 8160 - 7990i0 + i2 - o0 + 8160e0 and 3e17 = -2 + 2i0 and 3e18 = -1 + i0 and 144e0 <= -29 + 143i0 and 24480e0 <= -24481 + 24310i0 + 3o0 and 144e0 >= -47 + 143i0 and 24480e0 >= -24817 + 24310i0 + 3o0)) or (exists (e0 = floor((143i0)/144), e1 = floor((56*floor((143i0)/144))/143), e2 = floor((-4 + 11i0)/18), e3 = floor((-103 + 55i0)/144), e4 = floor((7 + 110i0 + 36*floor((143i0)/144))/99), e5 = floor((-39 + 22i0 + 54*floor((143i0)/144))/55), e6 = floor((-130 + 196i0 + 117*floor((143i0)/144))/243), e7 = floor((2 - 17i0)/6), e8 = floor((1 + i0)/3), e9 = floor((-19 - 317i0 + 456*floor((143i0)/144))/120), e10 = floor((-3 - 49i0 + 90*floor((143i0)/144))/38), e11 = floor((-25 + 46i0 - 18*floor((143i0)/144))/27), e12 = floor((-35 - 67i0 + 180*floor((143i0)/144))/108), e13 = floor((-14 - 61i0 + 108*floor((143i0)/144))/45), e14 = floor((-7 - 47i0)/24), e15 = floor((2 + 2i0)/3), e16 = floor((170 + 340i0 + 3i2)/510), e17 = floor((1 + 2i0)/3), e18 = floor((2 + i0)/3): o1 = 1 and 18e1 = -13 + 7i0 and 18e2 = -5 + 11i0 and 18e3 = -13 - 11i0 + 18e0 and 2e4 = 1 - 5i0 + 8e0 and 18e5 = -13 - 11i0 + 36e0 and 3e6 = 2 - 17i0 + 21e0 and 6e7 = -1 - 17i0 and 3e8 = -1 + i0 and 6e9 = -1 - 23i0 + 30e0 and 18e10 = -25 + 37i0 - 18e0 and 18e11 = -7 - 17i0 + 36e0 and 18e12 = -7 - 35i0 + 54e0 and 18e13 = -7 - 53i0 + 72e0 and 18e14 = -7 - 71i0 + 36e0 and 3e15 = 1 + 2i0 and 170e16 = 8160 - 7990i0 + i2 - o0 + 8160e0 and 3e17 = 1 + 2i0 and 3e18 = 2 + i0 and 144e0 <= -29 + 143i0 and 24480e0 >= -24479 + 24310i0 + 3o0 and 144e0 >= -47 + 143i0 and 24480e0 <= -24310 + 24310i0 + 3o0)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-42 - 25i0 + o1)/144): 170e0 = -i2 + o0 and 144e1 = -42 - 25i0 + o1 and o1 >= 0 and o1 <= 1 and 1360o1 >= 5780 - o0 and 1360o1 <= 5949 - o0)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-75 + i0)/144): o1 = 0 and 170e0 = -i2 + o0 and 144e1 = -75 + i0 and o0 >= 4250 and o0 <= 4419)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((-57i0 + 114*floor((143i0)/144))/142), e3 = floor((-31i0 + 31*floor((143i0)/144) + 31*floor((-57i0 + 114*floor((143i0)/144))/142))/56), e4 = floor((2 + 2i0)/3), e5 = floor((i2)/170): 3e4 = 2i0 + o1 and 510e5 = 24480 - 24310i0 + 3i2 - 3o0 - 170o1 + 24480e0 and 144e0 <= -72 + 143i0 and 144e0 >= -134 + 143i0 and 56e3 <= -31i0 + 31e0 + 31e2 and 429e1 <= 143 + 143i0 + 27e0 and 429e1 >= -285 + 143i0 + 27e0 and 24480e0 >= -24987 + 24310i0 + 3o0 + 170o1 and 56e3 >= -55 - 31i0 + 31e0 + 31e2 and 3e1 >= -25 + 28i0 - 27e0 and 142e2 <= -57i0 + 114e0 and 142e2 >= -141 - 57i0 + 114e0 and o1 <= 2 and o1 >= 0 and 27e3 >= -56 + 25i0 - 54e0 + 87e1 and 24480e0 <= -24480 + 24310i0 + 3o0 + 170o1)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((142*floor((143i0)/144))/143), e3 = floor((57*floor((142*floor((143i0)/144))/143))/142), e4 = floor((56*floor((57*floor((142*floor((143i0)/144))/143))/142))/57), e5 = floor((31*floor((56*floor((57*floor((142*floor((143i0)/144))/143))/142))/57))/56), e6 = floor((29*floor((31*floor((56*floor((57*floor((142*floor((143i0)/144))/143))/142))/57))/56))/31), e7 = floor((2*floor((29*floor((31*floor((56*floor((57*floor((142*floor((143i0)/144))/143))/142))/57))/56))/31))/29), e8 = floor((i0)/144), e9 = floor((2 + 2i0)/3), e10 = floor((i2)/170): 3e9 = 2i0 + o1 and 510e10 = 170i0 + 3i2 - 3o0 - 170o1 - 24480e8 and 24480e8 <= 507 + 170i0 - 3o0 - 170o1 and 144e0 >= -143 + 143i0 and 144e0 <= -72 + 143i0 and 429e1 <= 143 + 143i0 + 27e0 and 429e1 >= -285 + 143i0 + 27e0 and 143e2 <= 142e0 and 143e2 >= -142 + 142e0 and 27e2 >= -142 - 142i0 + 426e1 and 142e3 <= 57e2 and 142e3 >= -141 + 57e2 and 9e3 >= -19 - 19i0 + 57e1 and 57e4 <= 56e3 and 57e4 >= -56 + 56e3 and 27e4 >= -56 - 56i0 + 168e1 and 56e5 <= 31e4 and 56e5 >= -55 + 31e4 and 27e5 >= -31 - 31i0 + 93e1 and 31e6 <= 29e5 and 31e6 >= -30 + 29e5 and 27e6 >= -29 - 29i0 + 87e1 and 29e7 <= 2e6 and 29e7 >= -28 + 2e6 and 27e7 >= -2 - 2i0 + 6e1 and 144e8 <= i0 and 144e8 >= -143 + i0 and o1 <= 2 and o1 >= 0 and 24480e8 >= 170i0 - 3o0 - 170o1)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-31 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -31 + i0 and o0 <= 1869 and o0 >= 1757)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-31 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -31 + i0 and o0 <= 1756 and o0 >= 1700)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((3 + floor((143i0)/144) + 25*floor((143 + 143i0 + 27*floor((143i0)/144))/429))/28), e3 = floor((i2)/170): 3e2 = -48 + 53i0 + o1 - 51e0 - 3e1 and 510e3 = 24480 - 24310i0 + 3i2 - 3o0 - 170o1 + 24480e0 and 137e1 >= 138 - 199i0 + 255e0 and 24480e0 <= -24480 + 24310i0 + 3o0 + 170o1 and 3e1 <= -23 + 28i0 - 27e0 and 7632e1 >= -64944 + 71089i0 + 1344o1 - 68544e0 and 7632e1 <= -60913 + 71089i0 + 1344o1 - 68544e0 and 24480e0 >= -24987 + 24310i0 + 3o0 + 170o1 and 3e1 >= -25 + 28i0 - 27e0 and 426e1 <= 142 + 115i0 + 54e0 and 142e1 <= 142 - 57i0 + 114e0 and 159e1 <= -1272 + 1484i0 + 28o1 - 1431e0 and 159e1 >= -1353 + 1484i0 + 28o1 - 1431e0 and 137e1 <= 193 - 199i0 + 255e0 and o1 <= 2 and o1 >= 0 and 144e0 >= -143 + 143i0 and 144e0 <= 143i0)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((-57i0 + 114*floor((143i0)/144))/142), e3 = floor((-1288 + 1568i0 - 1512*floor((143i0)/144))/171), e4 = floor((-26 + 34i0)/3), e5 = floor((28i0)/3), e6 = floor((25i0)/3), e7 = floor((-14 + 19i0)/3), e8 = floor((-17 + 26i0)/3), e9 = floor((2 + 2i0)/3), e10 = floor((i2)/170), e11 = floor((2i0)/3), e12 = floor((-1 + i0)/3): o1 = 2 and 3e1 = -20 + 28i0 - 27e0 and 3e2 = -23 + 28i0 - 27e0 and 3e3 = -23 + 25i0 - 24e0 and 3e4 = -26 + 34i0 and 3e5 = -2 + 28i0 and 3e6 = -2 + 25i0 and 3e7 = -14 + 19i0 and 3e8 = -19 + 26i0 and 3e9 = 2 + 2i0 and 510e10 = 24140 - 24310i0 + 3i2 - 3o0 + 24480e0 and 3e11 = -1 + 2i0 and 3e12 = -2 + i0 and 144e0 <= -102 + 143i0 and 24480e0 <= -24482 + 24310i0 + 3o0 and 144e0 >= -107 + 143i0 and 24480e0 >= -24647 + 24310i0 + 3o0)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((-57i0 + 114*floor((143i0)/144))/142), e3 = floor((-1288 + 1568i0 - 1512*floor((143i0)/144))/171), e4 = floor((-26 + 34i0)/3), e5 = floor((28i0)/3), e6 = floor((25i0)/3), e7 = floor((-14 + 19i0)/3), e8 = floor((-17 + 26i0)/3), e9 = floor((2 + 2i0)/3), e10 = floor((i2)/170): o1 = 2 and 3e1 = -20 + 28i0 - 27e0 and 3e2 = -23 + 28i0 - 27e0 and 3e3 = -23 + 25i0 - 24e0 and 3e4 = -26 + 34i0 and 3e5 = -2 + 28i0 and 3e6 = -2 + 25i0 and 3e7 = -14 + 19i0 and 3e8 = -19 + 26i0 and 3e9 = 2 + 2i0 and 510e10 = 24140 - 24310i0 + 3i2 - 3o0 + 24480e0 and 24480e0 <= -24140 + 24310i0 + 3o0 and 24480e0 >= -24479 + 24310i0 + 3o0 and 144e0 >= -107 + 143i0 and 144e0 <= -102 + 143i0)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((-57i0 + 114*floor((143i0)/144))/142), e3 = floor((-1344 + 1568i0 - 1512*floor((143i0)/144))/171), e4 = floor((34i0)/3), e5 = floor((-19 + 28i0)/3), e6 = floor((-19 + 25i0)/3), e7 = floor((19i0)/3), e8 = floor((17i0)/3), e9 = floor((-25 + 34i0)/3), e10 = floor((-40 + 50i0)/3), e11 = floor((2 + 2i0)/3), e12 = floor((i2)/170): o1 = 0 and 3e1 = -21 + 28i0 - 27e0 and 3e2 = -24 + 28i0 - 27e0 and 3e3 = -24 + 25i0 - 24e0 and 3e4 = 34i0 and 3e5 = -21 + 28i0 and 3e6 = -21 + 25i0 and 3e7 = 19i0 and 3e8 = 17i0 and 3e9 = -27 + 34i0 and 3e10 = -42 + 50i0 and 3e11 = 2i0 and 510e12 = 24480 - 24310i0 + 3i2 - 3o0 + 24480e0 and 144e0 >= -111 + 143i0 and 144e0 <= -107 + 143i0 and 24480e0 <= -24480 + 24310i0 + 3o0 and 24480e0 >= -24987 + 24310i0 + 3o0)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-68 + i0)/144): o1 = 2 and 170e0 = -i2 + o0 and 144e1 = -68 + i0 and o0 <= 3909 and o0 >= 3854)) or (exists (e0 = floor((-68 + i0)/144), e1 = floor((-i2 + o0)/170), e2 = floor((-2 + o1)/3): 144e0 = -68 + i0 and 170e1 = -i2 + o0 and 3e2 = -2 + o1 and o0 <= 3853 and o1 <= 3 and o1 >= 2 and o0 >= 3740)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((-57i0 + 114*floor((143i0)/144))/142), e3 = floor((-168 - 56i0 + 168*floor((143i0)/144))/285), e4 = floor((-47 - 93i0 + 124*floor((143i0)/144))/140), e5 = floor((-244 + 407i0 - 231*floor((143i0)/144))/465), e6 = floor((-114 + 72i0 - 66*floor((143i0)/144))/31), e7 = floor((-437 + 311i0 - 138*floor((143i0)/144))/150), e8 = floor((2 + 2i0)/3), e9 = floor((i2)/170): 5e1 = 2 - i0 + 3e0 and 5e2 = -3 - i0 + 3e0 and 5e3 = -3 - 6i0 + 8e0 and 5e4 = -19 + 12i0 - 11e0 and 5e5 = 7 - 11i0 + 13e0 and 5e6 = -19 + 7i0 - 6e0 and 15e7 = -219 + 217i0 + 5o1 - 201e0 and 3e8 = 2i0 + o1 and 510e9 = 24480 - 24310i0 + 3i2 - 3o0 - 170o1 + 24480e0 and 1872e0 >= -1753 + 1859i0 + 50o1 and 144e0 <= -121 + 143i0 and 1872e0 <= -1608 + 1859i0 + 50o1 and 144e0 >= -131 + 143i0 and 24480e0 >= -24987 + 24310i0 + 3o0 + 170o1 and 2304e0 <= -1901 + 2288i0 + 35o1 and o1 <= 2 and o1 >= 0 and 24480e0 <= -24480 + 24310i0 + 3o0 + 170o1)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-28 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -28 + i0 and o0 >= 1587 and o0 <= 1699)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-28 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -28 + i0 and o0 <= 1586 and o0 >= 1530)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((-57i0 + 114*floor((143i0)/144))/142), e3 = floor((-1176 + 1568i0 - 1512*floor((143i0)/144))/171), e4 = floor((-14 + 22i0)/3), e5 = floor((31i0)/3), e6 = floor((1 + i0)/3), e7 = floor((2 + 2i0)/3), e8 = floor((i2)/170): o1 = 0 and 3e1 = -18 + 28i0 - 27e0 and 3e2 = -21 + 28i0 - 27e0 and 3e3 = -21 + 25i0 - 24e0 and 3e4 = -15 + 22i0 and 3e5 = 31i0 and 3e6 = i0 and 3e7 = 2i0 and 510e8 = 24480 - 24310i0 + 3i2 - 3o0 + 24480e0 and 144e0 <= -93 + 143i0 and 24480e0 >= -24987 + 24310i0 + 3o0 and 24480e0 <= -24480 + 24310i0 + 3o0 and 144e0 >= -97 + 143i0)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((-57i0 + 114*floor((143i0)/144))/142), e3 = floor((-1232 + 1568i0 - 1512*floor((143i0)/144))/171), e4 = floor((22i0)/3), e5 = floor((-22 + 31i0)/3), e6 = floor((1 + i0)/3), e7 = floor((2 + i0)/3), e8 = floor((-10 + 22i0)/3), e9 = floor((-22 + 40i0)/3), e10 = floor((1 + 2i0)/3), e11 = floor((2 + 2i0)/3), e12 = floor((170 + 340i0 + 3i2)/510), e13 = floor((2i0)/3), e14 = floor((-1 + i0)/3): o1 = 1 and 3e1 = -19 + 28i0 - 27e0 and 3e2 = -22 + 28i0 - 27e0 and 3e3 = -22 + 25i0 - 24e0 and 3e4 = -1 + 22i0 and 3e5 = -22 + 31i0 and 3e6 = -1 + i0 and 3e7 = 2 + i0 and 3e8 = -10 + 22i0 and 3e9 = -22 + 40i0 and 3e10 = 1 + 2i0 and 3e11 = 1 + 2i0 and 170e12 = 8160 - 7990i0 + i2 - o0 + 8160e0 and 3e13 = -2 + 2i0 and 3e14 = -1 + i0 and 144e0 >= -102 + 143i0 and 24480e0 <= -24481 + 24310i0 + 3o0 and 144e0 <= -98 + 143i0 and 24480e0 >= -24817 + 24310i0 + 3o0)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((-57i0 + 114*floor((143i0)/144))/142), e3 = floor((-1232 + 1568i0 - 1512*floor((143i0)/144))/171), e4 = floor((22i0)/3), e5 = floor((-22 + 31i0)/3), e6 = floor((1 + i0)/3), e7 = floor((2 + i0)/3), e8 = floor((-10 + 22i0)/3), e9 = floor((-22 + 40i0)/3), e10 = floor((1 + 2i0)/3), e11 = floor((2 + 2i0)/3), e12 = floor((170 + 340i0 + 3i2)/510): o1 = 1 and 3e1 = -19 + 28i0 - 27e0 and 3e2 = -22 + 28i0 - 27e0 and 3e3 = -22 + 25i0 - 24e0 and 3e4 = -1 + 22i0 and 3e5 = -22 + 31i0 and 3e6 = -1 + i0 and 3e7 = 2 + i0 and 3e8 = -10 + 22i0 and 3e9 = -22 + 40i0 and 3e10 = 1 + 2i0 and 3e11 = 1 + 2i0 and 170e12 = 8160 - 7990i0 + i2 - o0 + 8160e0 and 24480e0 <= -24310 + 24310i0 + 3o0 and 24480e0 >= -24479 + 24310i0 + 3o0 and 144e0 <= -98 + 143i0 and 144e0 >= -102 + 143i0)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-53 + i0)/144): o1 = 2 and 170e0 = -i2 + o0 and 144e1 = -53 + i0 and o0 <= 3059 and o0 >= 3004)) or (exists (e0 = floor((-53 + i0)/144), e1 = floor((-i2 + o0)/170), e2 = floor((-2 + o1)/3): 144e0 = -53 + i0 and 170e1 = -i2 + o0 and 3e2 = -2 + o1 and o0 >= 2890 and o1 <= 3 and o0 <= 3003 and o1 >= 2)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-58 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -58 + i0 and o0 >= 3287 and o0 <= 3399)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-58 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -58 + i0 and o0 >= 3230 and o0 <= 3286)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-63 + i0)/144): o1 = 0 and 170e0 = -i2 + o0 and 144e1 = -63 + i0 and o0 <= 3739 and o0 >= 3570)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-50 + i0)/144): o1 = 2 and 170e0 = -i2 + o0 and 144e1 = -50 + i0 and o0 >= 2834 and o0 <= 2889)) or (exists (e0 = floor((3i2)/170), e1 = floor((-50 + i0)/144), e2 = floor((-i2 + o0)/170), e3 = floor((-2 + o1)/3): 144e1 = -50 + i0 and 170e2 = -i2 + o0 and 3e3 = -2 + o1 and o0 >= 2720 and o0 <= 2833 and 170e0 >= -169 + 3i2 and 170e0 >= 8500 + 3i2 - 3o0 - 170o1 and 170e0 <= 8925 + 3i2 - 3o0 - 170o1 and 170e0 <= 3i2)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-9 - 115i0 + o1)/144): 170e0 = -i2 + o0 and 144e1 = -9 - 115i0 + o1 and o1 >= 0 and 340o1 <= 2719 - o0 and o1 <= 1 and 340o1 >= 2550 - o0)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-35 + i0)/144): o1 = 2 and 170e0 = -i2 + o0 and 144e1 = -35 + i0 and o0 <= 2039 and o0 >= 1984)) or (exists (e0 = floor((-35 + i0)/144), e1 = floor((-i2 + o0)/170), e2 = floor((-2 + o1)/3): 144e0 = -35 + i0 and 170e1 = -i2 + o0 and 3e2 = -2 + o1 and o1 <= 3 and o0 >= 1870 and o0 <= 1983 and o1 >= 2)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-10 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -10 + i0 and o0 >= 567 and o0 <= 679)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-10 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -10 + i0 and o0 <= 566 and o0 >= 510)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-25 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -25 + i0 and o0 >= 1417 and o0 <= 1529)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-25 + i0)/144): o1 = 1 and 170e0 = -i2 + o0 and 144e1 = -25 + i0 and o0 <= 1416 and o0 >= 1360)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((-57i0 + 114*floor((143i0)/144))/142), e3 = floor((-112 - 56i0 + 168*floor((143i0)/144))/285), e4 = floor((1 - 11i0 + 24*floor((143i0)/144))/21), e5 = floor((-11 - 33i0 + 44*floor((143i0)/144))/28), e6 = floor((31 - 77i0 + 156*floor((143i0)/144))/165), e7 = floor((-108 - 594i0 + 702*floor((143i0)/144))/275), e8 = floor((-8 - 64i0 + 72*floor((143i0)/144))/27), e9 = floor((1 + i0)/3), e10 = floor((-1794 + 728i0 - 65o1 - 624*floor((143i0)/144))/400), e11 = floor((2 + 2i0)/3), e12 = floor((i2)/170): 5e1 = 3 - i0 + 3e0 and 5e2 = -2 - i0 + 3e0 and 5e3 = -2 - 6i0 + 8e0 and 5e4 = -13 + 11i0 - 8e0 and 5e5 = -2 - 11i0 + 13e0 and 30e6 = -54 + 38i0 - 5o1 - 24e0 and 5e7 = -2 - 16i0 + 18e0 and 30e8 = -138 + 56i0 - 5o1 - 48e0 and 6e9 = 2i0 + o1 and 30e10 = -138 + 26i0 - 5o1 - 18e0 and 3e11 = 2i0 + o1 and 510e12 = 24480 - 24310i0 + 3i2 - 3o0 - 170o1 + 24480e0 and 24480e0 >= -24987 + 24310i0 + 3o0 + 170o1 and 576e0 >= -536 + 572i0 - 45o1 and o1 <= 2 and 1152e0 <= -902 + 1144i0 - 45o1 and 24480e0 <= -24480 + 24310i0 + 3o0 + 170o1 and o1 >= 0 and 144e0 <= -111 + 143i0 and 1152e0 <= -912 + 1144i0 + 5o1 and 1152e0 >= -1032 + 1144i0 + 5o1)) or (exists (e0 = floor((-i2 + o0)/170), e1 = floor((-12 - 115i0 + o1)/144): 170e0 = -i2 + o0 and 144e1 = -12 - 115i0 + o1 and o1 <= 1 and 340o1 <= 3569 - o0 and o1 >= 0 and 340o1 >= 3400 - o0)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((71 - 28i0 + 198*floor((143i0)/144))/213), e3 = floor((9i0 + 10*floor((143i0)/144))/47), e4 = floor((248 + 29i0 + 198*floor((143i0)/144))/285), e5 = floor((81 + 103i0 + 96*floor((143i0)/144))/140), e6 = floor((15 - 5i0 + 17*floor((143i0)/144))/29), e7 = floor((1 + i0)/3), e8 = floor((2 + 2i0)/3), e9 = floor((-782 + 1734i0 + i2 - 1632*floor((143i0)/144))/170), e10 = floor((2i0)/3), e11 = floor((-28 + 51i0 - 48*floor((143i0)/144))/10): 5e1 = 3 - i0 + 3e0 and 5e2 = 1 - 2i0 + 6e0 and 5e3 = -2 + 4i0 - 2e0 and 5e4 = 1 - 2i0 + 6e0 and 5e5 = -7 + 19i0 - 12e0 and 5e6 = -7 + 9i0 - 7e0 and 5e7 = -51 + 97i0 - 96e0 and 5e8 = -23 + 51i0 - 48e0 and 170e9 = 8160 - 7990i0 + i2 - o0 + 8160e0 and 5e10 = -28 + 51i0 - 48e0 and 3e11 = i0 - o1 and 9792e0 >= -9111 + 9724i0 + o0 and 24480e0 <= -24481 + 24310i0 + 3o0 and 144e0 >= -79 + 143i0 and 144e0 <= -74 + 143i0 and 9792e0 <= -8942 + 9724i0 + o0 and 144e0 >= -99 + 143i0 + 10o1 and 144e0 <= -84 + 143i0 + 10o1)) or (exists (e0 = floor((3i2)/170), e1 = floor((-i2 + o0)/170), e2 = floor((-15 - 115i0 + o1)/144): 170e1 = -i2 + o0 and 144e2 = -15 - 115i0 + o1 and 340o1 <= 4419 - o0 and 850o1 <= 12750 - 3o0 and 170e0 <= 3i2 and o1 <= 2 and 170e0 >= -169 + 3i2 and o1 > 0 and 340o1 >= 4250 - o0 and 170e0 >= 12750 + 3i2 - 3o0 - 1020o1 and 170e0 <= 12920 + 3i2 - 3o0 - 1020o1)) or (exists (e0 = floor((143i0)/144), e1 = floor((143 + 143i0 + 27*floor((143i0)/144))/429), e2 = floor((71 - 28i0 + 198*floor((143i0)/144))/213), e3 = floor((9i0 + 10*floor((143i0)/144))/47), e4 = floor((269 + 29i0 + 198*floor((143i0)/144))/285), e5 = floor((103 + 103i0 + 96*floor((143i0)/144))/140), e6 = floor((191 + 131i0 + 177*floor((143i0)/144))/155), e7 = floor((-258 + 497i0 + 10o1 - 321*floor((143i0)/144))/435), e8 = floor((2 + 2i0)/3), e9 = floor((i2)/170): 5e1 = 4 - i0 + 3e0 and 5e2 = -2 - 2i0 + 6e0 and 5e3 = -6 + 4i0 - 2e0 and 5e4 = 3 - 2i0 + 6e0 and 5e5 = -11 + 19i0 - 12e0 and 15e6 = -234 + 271i0 + 5o1 - 243e0 and 5e7 = -6 + 9i0 - 7e0 and 3e8 = 2i0 + o1 and 510e9 = 24480 - 24310i0 + 3i2 - 3o0 - 170o1 + 24480e0 and 144e0 >= -137 + 143i0 and 24480e0 >= -24987 + 24310i0 + 3o0 + 170o1 and 144e0 <= -127 + 143i0 and 24480e0 <= -24480 + 24310i0 + 3o0 + 170o1 and 144e0 <= 78 + 143i0 - 5o1 and o1 <= 2 and o1 >= 0 and 144e0 >= -132 + 143i0 - 5o1 and 8064e0 <= -7377 + 8008i0 + 155o1 and 8064e0 >= -7827 + 8008i0 + 155o1)) }

AFAIU this changes isl_map_lexmin to be called after applying the schedule, instead of before.

What I don't understand where the compile-time improvement comes from. Does isl_map_lexmin spend that much time? Why is it different depending on when it is applied? Are there other cases where calling isl_map_lexmin late will take more time than when applied early?

Maybe the problem is that the access relation needs to be simplified. Try calling isl_map_compute_divs, isl_map_detect_equalities and isl_map_coalesce on it before isl_map_lexmin. This already helped me in one case.

test/Isl/CodeGen/pattern-matching-based-opts.ll
13–63 ↗	(On Diff #81843)	Wouldn't it make more sense to check for the access relation instead for the generated code? I mean, there are a lot suffix digits in the checks which can change easily with otherwise unrelated changes, including with LLVM itself. The change is also unrelated to matrix-multiplication optimization. Is there a simpler test case? If I understand your summary correctly, this is only a compile-time optimization. How can there be a test case?

In D27878#625759, @gareevroman wrote:

For example, getAddressFunction() can return the following map . Subsequently, Memory::applyScheduleToAccessRelation tries to apply the schedule to its domain.

{ CopyStmt_1[i0, i1, i2] -> Packed_A[o0, o1] : (exists (e0 = floor((143i0)/144), [snip]

What is the access relation it is derived from? If it is that complex as well, we should consider ensuring that we do not generate that complicated access functions.

Hi Michael,

thanks for the comments!

I've updated the patch. It adds the third dimension to an array that stores a packed operand of the matrix multiplication, which, as far as I understand, helps to simplify the corresponding memory access and avoid the issue with isl_map_lexmin.

AFAIU this changes isl_map_lexmin to be called after applying the schedule, instead of before.

Sure.

What I don't understand where the compile-time improvement comes from. Does isl_map_lexmin spend that much time? Why is it different depending on when it is applied? Are there other cases where calling isl_map_lexmin late will take more time than when applied early?

Since the simplification of the memory accesses helps to avoid the issue, I propose to postpone the discussion.

Maybe the problem is that the access relation needs to be simplified. Try calling isl_map_compute_divs, isl_map_detect_equalities and isl_map_coalesce on it before isl_map_lexmin. This already helped me in one case.

It doesn't help in this case. However, thanks for the advice.

Wouldn't it make more sense to check for the access relation instead for the generated code?

I mean, there are a lot suffix digits in the checks which can change easily with otherwise unrelated changes, including with LLVM itself.

Yes, we could probably check the access relation, if it could be incorrect.

The change is also unrelated to matrix-multiplication optimization. Is there a simpler test case?

I think that the new version of the patch doesn't require a new test case.

If I understand your summary correctly, this is only a compile-time optimization. How can there be a test case?

Right. It probably makes sense to provide more details about the reason of the compile-time regression.

What is the access relation it is derived from?

It represents an access to the A, the first operand of the matrix multiplication. It also requires the specific values of LatencyVectorFma, ThroughputVectorFma, CacheLevelAssociativity, and CacheLevelSizes (e.g., 7, 1, 8, 8, 32768, 262144, respectively) to reproduce the issue.

If it is that complex as well, we should consider ensuring that we do not generate that complicated access functions.

Right.

Interesting.

Could you add some more information to the commit message:

Give before/after compile times for a specific example
Explain why we previously had 2D accesses

Also, I see that besides the helper function, no comments changed. It would be good to add documentation how the packing transformation is implemented. Possibly with a simple example.

Also, as Michael suggested, we should consider factoring a separate function that implements packing and is only paramterized for the A/B array.

Obviously, the refactoring is not a requirement for this patch to in and should better be done separately. If it is easy, I suggest to do it first.

Accept for the patch itself. Tobias suggested a more detailed commit message.

lib/Transform/ScheduleOptimizer.cpp
813–817 ↗	(On Diff #81929)	That's nice, it even make the code simpler.

This revision is now accepted and ready to land.Dec 19 2016, 2:28 AM

Could you add some more information to the commit message:

Give before/after compile times for a specific example
Explain why we previously had 2D accesses

Hi Tobias,

I've updated the commit message.

Also, I see that besides the helper function, no comments changed. It would be good to add documentation how the packing transformation is implemented.

I think that the comment to optimizeDataLayoutMatrMulPattern gives a brief overview of how the packing transformation is implemented. What do you think about it?

Possibly with a simple example.

What do you think about the following example:

"As an example let us consider the packing of the array A, which is represented by an assess relation that has the form { Stmt_for_body6[i0, i1, i2] -> [o0, o1, o2, o3, o4, o5, o6, o7, 0] : 256*floor((-i2 + o5)/256) = -i2 + o5 and 8*floor((-i1 + o7)/8) = -i1 + o7 and 4*floor((i0 - o6)/4) = i0 - o6 and -15 + i1 <= 16o0 <= i1 and -255 + i2 <= 256o1 <= i2 and -95 + i0 <= 96o2 <= i0 and 0 <= o5 <= 255 and 0 <= o6 <= 3 and 0 <= o7 <= 7 and -7 + i1 - 8o3 <= 16*floor((i1)/16) <= i1 - 8o3 and -3 + i0 - 4o4 <= 96*floor((i0)/96) <= i0 - 4o4 }

To add a new array Packed_A[24][256][4] to the SCoP, we use Scop::createScopArrayInfo(). Copying of data to the array is performed by the copy statement created by Scop::addScopStmt. Changing of the memory access locations to {Stmt_for_body6[i0, i1, i2] -> [o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 4*floor((-i0 + o2)/4) = -i0 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 3 and -3 + i0 - 4o0 <= 96*floor((i0)/96) <= i0 - 4o0} is performed by MemoryAccess::setNewAccessRelation."

Probably, it makes sense to commit such a comment separately.

Also, as Michael suggested, we should consider factoring a separate function that implements packing and is only paramterized for the A/B array.

Obviously, the refactoring is not a requirement for this patch to in and should better be done separately. If it is easy, I suggest to do it first.

Yes, the refactoring is probably required since r278666. At the moment, it seems that a parametrized packing would have too many parameters (e.g., parameters of new memory accesses, domains of copy statements, sizes of dimensions of newly created arrays).

I'll fix a few issues related to the optimization and the identification of the matrix multiplication and start working on the refactoring.

In D27878#626450, @gareevroman wrote:

Could you add some more information to the commit message:

Give before/after compile times for a specific example
Explain why we previously had 2D accesses

Hi Tobias,

I've updated the commit message.

Nice. This looks a lot better. Feel free to commit.

Also, I see that besides the helper function, no comments changed. It would be good to add documentation how the packing transformation is implemented.

I think that the comment to optimizeDataLayoutMatrMulPattern gives a brief overview of how the packing transformation is implemented. What do you think about it?

Possibly with a simple example.

What do you think about the following example:

"As an example let us consider the packing of the array A, which is represented by an assess relation that has the form { Stmt_for_body6[i0, i1, i2] -> [o0, o1, o2, o3, o4, o5, o6, o7, 0] : 256*floor((-i2 + o5)/256) = -i2 + o5 and 8*floor((-i1 + o7)/8) = -i1 + o7 and 4*floor((i0 - o6)/4) = i0 - o6 and -15 + i1 <= 16o0 <= i1 and -255 + i2 <= 256o1 <= i2 and -95 + i0 <= 96o2 <= i0 and 0 <= o5 <= 255 and 0 <= o6 <= 3 and 0 <= o7 <= 7 and -7 + i1 - 8o3 <= 16*floor((i1)/16) <= i1 - 8o3 and -3 + i0 - 4o4 <= 96*floor((i0)/96) <= i0 - 4o4 }

Why is the _access_ relation mapping to a 9 dimensional space. You talk about a _schedule_ here, right?

To add a new array Packed_A[24][256][4] to the SCoP, we use Scop::createScopArrayInfo(). Copying of data to the array is performed by the copy statement created by Scop::addScopStmt. Changing of the memory access locations to {Stmt_for_body6[i0, i1, i2] -> [o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 4*floor((-i0 + o2)/4) = -i0 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 3 and -3 + i0 - 4o0 <= 96*floor((i0)/96) <= i0 - 4o0} is performed by MemoryAccess::setNewAccessRelation."

The access relation seems to implement a modulo constraint. It probably is easier to read if we write the modulo explicitly. I suggest to just submit a patch with this comment, then Michael and me can read over it and see if we still have some comments.

Probably, it makes sense to commit such a comment separately.

Right.

Also, as Michael suggested, we should consider factoring a separate function that implements packing and is only paramterized for the A/B array.

Obviously, the refactoring is not a requirement for this patch to in and should better be done separately. If it is easy, I suggest to do it first.

Yes, the refactoring is probably required since r278666. At the moment, it seems that a parametrized packing would have too many parameters (e.g., parameters of new memory accesses, domains of copy statements, sizes of dimensions of newly created arrays).

I'll fix a few issues related to the optimization and the identification of the matrix multiplication and start working on the refactoring.

Cool.

Closed by commit rL290251: [Polly] Use three-dimensional arrays to store packed operands of the matrix (authored by romangareev). · Explain WhyDec 21 2016, 3:29 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

polly/

trunk/

lib/

Transform/

ScheduleOptimizer.cpp

44 lines

test/

ScheduleOptimizer/

mat_mul_pattern_data_layout.ll

12 lines

Diff 82221

polly/trunk/lib/Transform/ScheduleOptimizer.cpp

Show First 20 Lines • Show All 779 Lines • ▼ Show 20 Lines	MemoryAccess identifyAccessB(ScopStmt Stmt) {
auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1);		auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1);
return identifyAccessByAccessRelation(Stmt, OriginalRel);		return identifyAccessByAccessRelation(Stmt, OriginalRel);
}		}

/// Create an access relation that is specific to		/// Create an access relation that is specific to
/// the matrix multiplication pattern.		/// the matrix multiplication pattern.
///		///
/// Create an access relation of the following form:		/// Create an access relation of the following form:
/// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [O5 + K * OI, OJ],		/// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
/// where K is @p Coeff, I is @p FirstDim, J is @p SecondDim.		/// where I is @p FirstDim, J is @p SecondDim.
///		///
/// It can be used, for example, to create relations that helps to consequently		/// It can be used, for example, to create relations that helps to consequently
/// access elements of operands of a matrix multiplication after creation of		/// access elements of operands of a matrix multiplication after creation of
/// the BLIS micro and macro kernels.		/// the BLIS micro and macro kernels.
///		///
/// @see ScheduleTreeOptimizer::createMicroKernel		/// @see ScheduleTreeOptimizer::createMicroKernel
/// @see ScheduleTreeOptimizer::createMacroKernel		/// @see ScheduleTreeOptimizer::createMacroKernel
///		///
/// Subsequently, the described access relation is applied to the range of		/// Subsequently, the described access relation is applied to the range of
/// @p MapOldIndVar, that is used to map original induction variables to		/// @p MapOldIndVar, that is used to map original induction variables to
/// the ones, which are produced by schedule transformations. It helps to		/// the ones, which are produced by schedule transformations. It helps to
/// define relations using a new space and, at the same time, keep them		/// define relations using a new space and, at the same time, keep them
/// in the original one.		/// in the original one.
///		///
/// @param MapOldIndVar The relation, which maps original induction variables		/// @param MapOldIndVar The relation, which maps original induction variables
/// to the ones, which are produced by schedule		/// to the ones, which are produced by schedule
/// transformations.		/// transformations.
/// @param Coeff The coefficient that is used to define the specified access
/// relation.
/// @param FirstDim, SecondDim The input dimensions that are used to define		/// @param FirstDim, SecondDim The input dimensions that are used to define
/// the specified access relation.		/// the specified access relation.
/// @return The specified access relation.		/// @return The specified access relation.
__isl_give isl_map getMatMulAccRel(__isl_take isl_map MapOldIndVar,		__isl_give isl_map getMatMulAccRel(__isl_take isl_map MapOldIndVar,
unsigned Coeff, unsigned FirstDim,		unsigned FirstDim, unsigned SecondDim) {
unsigned SecondDim) {
auto *Ctx = isl_map_get_ctx(MapOldIndVar);		auto *Ctx = isl_map_get_ctx(MapOldIndVar);
auto *AccessRelSpace = isl_space_alloc(Ctx, 0, 9, 2);		auto *AccessRelSpace = isl_space_alloc(Ctx, 0, 9, 3);
auto *AccessRel = isl_map_universe(isl_space_copy(AccessRelSpace));		auto *AccessRel = isl_map_universe(AccessRelSpace);
auto *ConstrSpace = isl_local_space_from_space(AccessRelSpace);		AccessRel = isl_map_equate(AccessRel, isl_dim_in, FirstDim, isl_dim_out, 0);
auto *Constr = isl_constraint_alloc_equality(ConstrSpace);		AccessRel = isl_map_equate(AccessRel, isl_dim_in, 5, isl_dim_out, 1);
Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, 0, -1);		AccessRel = isl_map_equate(AccessRel, isl_dim_in, SecondDim, isl_dim_out, 2);
Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, 5, 1);
Constr =
isl_constraint_set_coefficient_si(Constr, isl_dim_in, FirstDim, Coeff);
AccessRel = isl_map_add_constraint(AccessRel, Constr);
AccessRel = isl_map_equate(AccessRel, isl_dim_in, SecondDim, isl_dim_out, 1);
return isl_map_apply_range(MapOldIndVar, AccessRel);		return isl_map_apply_range(MapOldIndVar, AccessRel);
}		}

__isl_give isl_schedule_node *		__isl_give isl_schedule_node *
createExtensionNode(__isl_take isl_schedule_node *Node,		createExtensionNode(__isl_take isl_schedule_node *Node,
__isl_take isl_map *ExtensionMap) {		__isl_take isl_map *ExtensionMap) {
auto *Extension = isl_union_map_from_map(ExtensionMap);		auto *Extension = isl_union_map_from_map(ExtensionMap);
auto *NewNode = isl_schedule_node_from_extension(Extension);		auto *NewNode = isl_schedule_node_from_extension(Extension);
Show All 30 Lines	static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
}		}

// Create a copy statement that corresponds to the memory access to the		// Create a copy statement that corresponds to the memory access to the
// matrix B, the second operand of the matrix multiplication.		// matrix B, the second operand of the matrix multiplication.
Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));		Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));		Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
Node = isl_schedule_node_parent(Node);		Node = isl_schedule_node_parent(Node);
Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);		Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
auto *AccRel =		auto *AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 3, 7);
getMatMulAccRel(isl_map_copy(MapOldIndVar), MacroParams.Kc, 3, 7);		unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
unsigned FirstDimSize = MacroParams.Nc * MacroParams.Kc / MicroParams.Nr;		unsigned SecondDimSize = MacroParams.Kc;
unsigned SecondDimSize = MicroParams.Nr;		unsigned ThirdDimSize = MicroParams.Nr;
auto *SAI = Stmt->getParent()->createScopArrayInfo(		auto *SAI = Stmt->getParent()->createScopArrayInfo(
MemAccessB->getElementType(), "Packed_B", {FirstDimSize, SecondDimSize});		MemAccessB->getElementType(), "Packed_B",
		{FirstDimSize, SecondDimSize, ThirdDimSize});
AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());		AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
auto *OldAcc = MemAccessB->getAccessRelation();		auto *OldAcc = MemAccessB->getAccessRelation();
MemAccessB->setNewAccessRelation(AccRel);		MemAccessB->setNewAccessRelation(AccRel);
auto *ExtMap =		auto *ExtMap =
getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);		getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);		isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);		isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);		ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
auto *Domain = Stmt->getDomain();		auto *Domain = Stmt->getDomain();

// Restrict the domains of the copy statements to only execute when also its		// Restrict the domains of the copy statements to only execute when also its
// originating statement is executed.		// originating statement is executed.
auto *DomainId = isl_set_get_tuple_id(Domain);		auto *DomainId = isl_set_get_tuple_id(Domain);
auto *NewStmt = Stmt->getParent()->addScopStmt(		auto *NewStmt = Stmt->getParent()->addScopStmt(
OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));		OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));		ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));		ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());		ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
Node = createExtensionNode(Node, ExtMap);		Node = createExtensionNode(Node, ExtMap);

// Create a copy statement that corresponds to the memory access		// Create a copy statement that corresponds to the memory access
// to the matrix A, the first operand of the matrix multiplication.		// to the matrix A, the first operand of the matrix multiplication.
Node = isl_schedule_node_child(Node, 0);		Node = isl_schedule_node_child(Node, 0);
AccRel = getMatMulAccRel(MapOldIndVar, MacroParams.Kc, 4, 6);		AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
FirstDimSize = MacroParams.Mc * MacroParams.Kc / MicroParams.Mr;		FirstDimSize = MacroParams.Mc / MicroParams.Mr;
SecondDimSize = MicroParams.Mr;		ThirdDimSize = MicroParams.Mr;
SAI = Stmt->getParent()->createScopArrayInfo(		SAI = Stmt->getParent()->createScopArrayInfo(
MemAccessA->getElementType(), "Packed_A", {FirstDimSize, SecondDimSize});		MemAccessA->getElementType(), "Packed_A",
		{FirstDimSize, SecondDimSize, ThirdDimSize});
AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());		AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
OldAcc = MemAccessA->getAccessRelation();		OldAcc = MemAccessA->getAccessRelation();
MemAccessA->setNewAccessRelation(AccRel);		MemAccessA->setNewAccessRelation(AccRel);
ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);		ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);		isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);		isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
NewStmt = Stmt->getParent()->addScopStmt(		NewStmt = Stmt->getParent()->addScopStmt(
OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));		OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
▲ Show 20 Lines • Show All 358 Lines • Show Last 20 Lines

polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll

	; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-cache-level-associativity=8,8 -polly-target-cache-level-sizes=32768,262144 -polly-optimized-scops < %s 2>&1 \| FileCheck %s			; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-cache-level-associativity=8,8 -polly-target-cache-level-sizes=32768,262144 -polly-optimized-scops < %s 2>&1 \| FileCheck %s
	;			;
	; /* C := alphaAB + betaC /			; /* C := alphaAB + betaC /
	; for (i = 0; i < _PB_NI; i++)			; for (i = 0; i < _PB_NI; i++)
	; for (j = 0; j < _PB_NJ; j++)			; for (j = 0; j < _PB_NJ; j++)
	; {			; {
	; C[i][j] *= beta;			; C[i][j] *= beta;
	; for (k = 0; k < _PB_NK; ++k)			; for (k = 0; k < _PB_NK; ++k)
	; C[i][j] += alpha * A[i][k] * B[k][j];			; C[i][j] += alpha * A[i][k] * B[k][j];
	; }			; }
	;			;
	; CHECK: double Packed_B[ { [] -> [(512)] } ][ { [] -> [(8)] } ]; // Element size 8			; CHECK: double Packed_B[ { [] -> [(2)] } ][ { [] -> [(256)] } ][ { [] -> [(8)] } ]; // Element size 8
	; CHECK-NEXT: double Packed_A[ { [] -> [(6144)] } ][ { [] -> [(4)] } ]; // Element size 8			; CHECK-NEXT: double Packed_A[ { [] -> [(24)] } ][ { [] -> [(256)] } ][ { [] -> [(4)] } ]; // Element size 8
	;			;
	; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg6[i0, i2] };			; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg6[i0, i2] };
	; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1] : 256floor((-i2 + o0)/256) = -i2 + o0 and 4floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 96floor((i0)/96) <= 4floor((o0)/256) <= i0 - 96*floor((i0)/96) };			; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1, o2] : 256floor((-i2 + o1)/256) = -i2 + o1 and 4floor((-i0 + o2)/4) = -i0 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 3 and -3 + i0 - 4o0 <= 96*floor((i0)/96) <= i0 - 4o0 };
	;			;
	; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg7[i2, i1] };			; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg7[i2, i1] };
	; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1] : 256floor((-i2 + o0)/256) = -i2 + o0 and 8floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 16floor((i1)/16) <= 8floor((o0)/256) <= i1 - 16*floor((i1)/16) };			; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256floor((-i2 + o1)/256) = -i2 + o1 and 8floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 16*floor((i1)/16) <= i1 - 8o0 };
	;			;
	; CHECK: CopyStmt_0			; CHECK: CopyStmt_0
	; CHECK-NEXT: Domain :=			; CHECK-NEXT: Domain :=
	; CHECK-NEXT: { CopyStmt_0[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };			; CHECK-NEXT: { CopyStmt_0[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
	; CHECK-NEXT: Schedule :=			; CHECK-NEXT: Schedule :=
	; CHECK-NEXT: ;			; CHECK-NEXT: ;
	; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]			; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
	; CHECK-NEXT: null;			; CHECK-NEXT: null;
	; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1] : 256floor((-i2 + o0)/256) = -i2 + o0 and 8floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 16floor((i1)/16) <= 8floor((o0)/256) <= i1 - 16*floor((i1)/16) };			; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256floor((-i2 + o1)/256) = -i2 + o1 and 8floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 16*floor((i1)/16) <= i1 - 8o0 };
	; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]			; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
	; CHECK-NEXT: null;			; CHECK-NEXT: null;
	; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg7[i2, i1] };			; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg7[i2, i1] };
	; CHECK-NEXT: CopyStmt_1			; CHECK-NEXT: CopyStmt_1
	; CHECK-NEXT: Domain :=			; CHECK-NEXT: Domain :=
	; CHECK-NEXT: { CopyStmt_1[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };			; CHECK-NEXT: { CopyStmt_1[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
	; CHECK-NEXT: Schedule :=			; CHECK-NEXT: Schedule :=
	; CHECK-NEXT: ;			; CHECK-NEXT: ;
	; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]			; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
	; CHECK-NEXT: null;			; CHECK-NEXT: null;
	; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> Packed_A[o0, o1] : 256floor((-i2 + o0)/256) = -i2 + o0 and 4floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 96floor((i0)/96) <= 4floor((o0)/256) <= i0 - 96*floor((i0)/96) };			; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> Packed_A[o0, o1, o2] : 256floor((-i2 + o1)/256) = -i2 + o1 and 4floor((-i0 + o2)/4) = -i0 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 3 and -3 + i0 - 4o0 <= 96*floor((i0)/96) <= i0 - 4o0 };
	; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]			; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
	; CHECK-NEXT: null;			; CHECK-NEXT: null;
	; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> MemRef_arg6[i0, i2] };			; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> MemRef_arg6[i0, i2] };
	;			;
	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-unknown"			target triple = "x86_64-unknown-unknown"

	define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 {			define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 {
	▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines