Changeset View
Standalone View
MicroBenchmarks/harris/main.cpp
/* For polymage-benchmarks-harris kernel | |||||
Copyright (c) 2015 Indian Institute of Science | |||||
All rights reserved. | |||||
Written and provided by: | |||||
Ravi Teja Mullapudi, Vinay Vasista, Uday Bondhugula | |||||
Dept of Computer Science and Automation | |||||
Indian Institute of Science | |||||
Bangalore 560012 | |||||
India | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in the | |||||
documentation and/or other materials provided with the distribution. | |||||
3. Neither the name of the Indian Institute of Science nor the | |||||
names of its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS MATERIAL IS PROVIDED BY Ravi Teja Mullapudi, Vinay Vasista, and Uday | |||||
Bondhugula, Indian Institute of Science ''AS IS'' AND ANY EXPRESS OR IMPLIED | |||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF | |||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO | |||||
EVENT SHALL Ravi Teja Mullapudi, Vinay Vasista, CSA Indian Institute of | |||||
Science BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||||
POSSIBILITY OF SUCH DAMAGE. | |||||
*/ | |||||
// ============================================================================ | |||||
/* | |||||
* Pankaj Kukreja | |||||
* Indian Institute of Technology Hyderabad | |||||
* | |||||
* Acknowledgements | |||||
// ============================================================================ | |||||
* HARRIS KERNEL from Polymage benchmark (modified) | |||||
* File: polymage-benchmarks/apps/harris/harris_polymage_naive.cpp | |||||
*/ | |||||
// ============================================================================ | |||||
#include "harris.h"
#include <cstdio>
#include <cstdlib>
int sum = 0; | |||||
#ifdef BENCHMARK_LIB | |||||
#include "benchmark/benchmark.h" | |||||
#endif | |||||
// This function initializes the input image to checkbox image | |||||
// Can be replaced with any other image initialization | |||||
void initCheckboardImage(int height, int width, | |||||
float image[(2 + HEIGHT)][2 + WIDTH]) { | |||||
int last_pixel_x = 0; | |||||
int last_pixel_y = 0; | |||||
for (int i = 0; i < height; i++) { | |||||
if (i % BOX_SIZE == 0) { | |||||
last_pixel_y = (last_pixel_y + 1) % 2; | |||||
} | |||||
last_pixel_x = last_pixel_y; | |||||
for (int j = 0; j < width; j++) { | |||||
if (j % BOX_SIZE == 0) { | |||||
last_pixel_x = (last_pixel_x + 1) % 2; | |||||
} | |||||
if (last_pixel_x == 0) { | |||||
image[i][j] = 255; | |||||
} else { | |||||
image[i][j] = 0; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
// Writes image matrix to a file. | |||||
void printImage(int height, int width, float arr[(2 + HEIGHT)][2 + WIDTH], | |||||
int dummy) { | |||||
std::ofstream myfile; | |||||
myfile.open("output.txt"); | |||||
homerdin: This will write `output.txt` into whichever directory lit is run in. You can set the working… | |||||
for (int i = 0; i < height - 2; i++) { | |||||
for (int j = 0; j < width - 2; j++) { | |||||
if (arr[i][j] < 0) { | |||||
myfile << 0; | |||||
} else if (arr[i][j] > 255) { | |||||
myfile << 3; | |||||
} else { | |||||
myfile << (int)(arr[i][j]); | |||||
} | |||||
} | |||||
myfile << "\n"; | |||||
} | |||||
// Dummy code to make sure the allocated ImageOutput Array is not optimized | |||||
// out | |||||
if (dummy > 0) { | |||||
myfile << sum; | |||||
} | |||||
} | |||||
#ifdef BENCHMARK_LIB | |||||
void BENCHMARK_HARRIS(benchmark::State &state) { | |||||
int height = state.range(0); | |||||
int width = state.range(1); | |||||
float(*image)[HEIGHT + 2][WIDTH + 2]; | |||||
image = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
initCheckboardImage((HEIGHT + 2), (WIDTH + 2), *image); | |||||
float(*imageOutput)[2 + HEIGHT][2 + WIDTH]; | |||||
imageOutput = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Not Done ReplyInline ActionsWhy are these still using HEIGHT and WIDTH? Why aren't these just: const size_t height = state.range(0); const size_t weight = state.range(1); float **image = reinterpret_cast<float**>(malloc(sizeof(float) * (2 + height) * (2 + width))); float **imageOutput = reinterpret_cast<float**>(malloc(sizeof(float) * (2 + height) * (2 + width))); dberris: Why are these still using `HEIGHT` and `WIDTH`? Why aren't these just:
```
const size_t height… | |||||
float(*Ix)[2 + HEIGHT][2 + WIDTH]; | |||||
There's a few comments I have about this code, but let me start with the simple(r) ones:
dberris: There's a few comments I have about this code, but let me start with the simple(r) ones:
- You… | |||||
float(*Iy)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Ixx)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Ixy)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Iyy)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Sxx)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Sxy)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Syy)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*det)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*trace)[2 + HEIGHT][2 + WIDTH]; | |||||
Ix = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Iy = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Ixx = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Ixy = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Iyy = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Sxx = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Sxy = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Syy = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
det = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
trace = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
harrisKernel(height, width, *image, *imageOutput, *Ix, *Iy, *Ixx, *Ixy, *Iyy, | |||||
*Sxx, *Sxy, *Syy, *det, *trace); | |||||
for (auto _ : state) { | |||||
harrisKernel(height, width, *image, *imageOutput, *Ix, *Iy, *Ixx, *Ixy, | |||||
*Iyy, *Sxx, *Sxy, *Syy, *det, *trace); | |||||
} | |||||
free((void *)Ix); | |||||
free((void *)Iy); | |||||
free((void *)Ixx); | |||||
free((void *)Ixy); | |||||
free((void *)Iyy); | |||||
free((void *)Sxx); | |||||
free((void *)Sxy); | |||||
free((void *)Syy); | |||||
free((void *)det); | |||||
free((void *)trace); | |||||
for (int i = 0; i < height + 2; i++) { | |||||
for (int j = 0; j < width + 2; j++) { | |||||
sum = (sum + 1) & (int)(*imageOutput)[i][j]; | |||||
} | |||||
} | |||||
state.SetBytesProcessed(sizeof(float) * (height + 2) * (width + 2) * | |||||
state.iterations()); | |||||
Not Done ReplyInline ActionsIt seems that HEIGHT and WIDTH are input values anyway, consider making multiple input sizes to see how the kernel performs as you scale the image size goes up. You might also not need the __restrict__ attributes for the malloc-provided heap memory either. This means you could do: float **image = reinterpret_cast<float**>(malloc(sizeof(float) * (2 + state.range(0)) * (2 + state.range(1)))); When you register the benchmark, you can then provide the image sizes to test with: BENCHMARK(HarrisBenchmark) ->Unit(benchmark::kMicrosecond) ->Args({256, 256}) ->Args({512, 512}) ->Args({1024, 1024}) ->Args({2048, 2048}); You can see more options at https://github.com/google/benchmark#passing-arguments. Another thing you may consider measuring as I suggested in the past is throughput. To do that, you can call state.SetBytesProcessed(...) in the benchmark body, typically at the end just before exiting -- you want to essentially report something like: state.SetBytesProcessed(sizeof(float) * (state.range(0) + 2) * (state.range(1) + 2) * state.iterations()); This will add a "MB/sec" output alongside the time it took for each iteration of the benchmark. dberris: It seems that HEIGHT and WIDTH are input values anyway, consider making multiple input sizes to… | |||||
Not Done ReplyInline ActionsCannot use float **image as pointers may overlap and this prevents Polly from detecting scops. I have to allocate the fixed size arrays here as "float (&outputImg)[2+height][2+width] = *reinterpret_cast<float (*)[2+height][2+width]>((float *) malloc(...)); " is not allowed by clang++ I did considered adding SetBytesProcessed but I was not sure how many bytes should be written as argument (output image size or the total bytes accessed in kernel) so I commented the line "SetBytesProcessed(static_cast<int64_t>(state.iterations())*WIDTH*HEIGHT*50);" but forgot to ask about it. proton: Cannot use float **image as pointers may overlap and this prevents Polly from detecting scops. | |||||
Not Done ReplyInline ActionsI don't know whether you want to optimise for Polly or make Polly just recognise these pointers shouldn't overlap. If Polly can't detect that these pointers are coming from different 'malloc' calls, then I suspect that's a bug in Polly rather than something you need to work around in the benchmark. Note that maybe the better thing to do is to change the kernel's API to put restrict or __restrict__ on the pointers, so that the optimiser in those cases might be able to assume that the pointers don't alias and don't do anything special in this benchmark. See my top-level comment for alternatives to explore, if you're open to it. dberris: I don't know whether you want to optimise for Polly or make Polly just recognise these pointers… | |||||
free((void *)imageOutput); | |||||
free((void *)image); | |||||
} | |||||
// Registers the Harris benchmark for square images from 256 x 256 up to
// 2048 x 2048, with timings reported in microseconds.
BENCHMARK(BENCHMARK_HARRIS)
    ->Unit(benchmark::kMicrosecond)
    ->Args({256, 256})
    ->Args({512, 512})
    ->Args({1024, 1024})
    ->Args({2048, 2048});
#endif | |||||
int main(int argc, char *argv[]) { | |||||
sum = 1; | |||||
#ifdef BENCHMARK_LIB | |||||
::benchmark::Initialize(&argc, argv); | |||||
if (::benchmark::ReportUnrecognizedArguments(argc, argv)) | |||||
return 1; | |||||
::benchmark::RunSpecifiedBenchmarks(); | |||||
#endif | |||||
// Extra Call to verify output of kernel | |||||
float(*image)[HEIGHT + 2][WIDTH + 2]; | |||||
image = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
initCheckboardImage((HEIGHT + 2), (WIDTH + 2), *image); | |||||
float(*imageOutput)[2 + HEIGHT][2 + WIDTH]; | |||||
imageOutput = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
float(*Ix)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Iy)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Ixx)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Ixy)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Iyy)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Sxx)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Sxy)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*Syy)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*det)[2 + HEIGHT][2 + WIDTH]; | |||||
float(*trace)[2 + HEIGHT][2 + WIDTH]; | |||||
Ix = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Iy = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Ixx = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Ixy = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Iyy = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Sxx = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Sxy = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
Syy = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
det = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
trace = (float(*)[2 + HEIGHT][2 + WIDTH]) | |||||
malloc(sizeof(float) * (2 + HEIGHT) * (2 + WIDTH)); | |||||
harrisKernel(HEIGHT, WIDTH, *image, *imageOutput, *Ix, *Iy, *Ixx, *Ixy, *Iyy, | |||||
*Sxx, *Sxy, *Syy, *det, *trace); | |||||
free((void *)Ix); | |||||
free((void *)Iy); | |||||
free((void *)Ixx); | |||||
free((void *)Ixy); | |||||
free((void *)Iyy); | |||||
free((void *)Sxx); | |||||
free((void *)Sxy); | |||||
free((void *)Syy); | |||||
free((void *)det); | |||||
free((void *)trace); | |||||
if (argc == 2) { | |||||
printImage(HEIGHT + 2, WIDTH + 2, *imageOutput, sum); | |||||
} else { | |||||
printImage(HEIGHT + 2, WIDTH + 2, *imageOutput, -1); | |||||
} | |||||
free((void *)image); | |||||
free((void *)imageOutput); | |||||
return 0; | |||||
} |
This will write `output.txt` into whichever directory lit is run in. You can set the working directory that the test will run in by passing an argument to `llvm_test_run()`, for example: `llvm_test_run(WORKDIR ${CMAKE_CURRENT_BINARY_DIR})`.