Index: polly/trunk/lib/Transform/ScheduleOptimizer.cpp =================================================================== --- polly/trunk/lib/Transform/ScheduleOptimizer.cpp +++ polly/trunk/lib/Transform/ScheduleOptimizer.cpp @@ -134,16 +134,33 @@ "instructions per clock cycle."), cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory)); -static cl::list<int> - CacheLevelAssociativity("polly-target-cache-level-associativity", - cl::desc("The associativity of each cache level."), - cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, - cl::cat(PollyCategory)); - -static cl::list<int> CacheLevelSizes( - "polly-target-cache-level-sizes", - cl::desc("The size of each cache level specified in bytes."), cl::Hidden, - cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory)); +// This option, along with --polly-target-2nd-cache-level-associativity, +// --polly-target-1st-cache-level-size, and --polly-target-2nd-cache-level-size +// represent the parameters of the target cache, which do not have typical +// values that can be used by default. However, to apply the pattern matching +// optimizations, we use the values of the parameters of Intel Core i7-3820 +// SandyBridge in case the parameters are not specified. Such an approach also +// helps to attain high performance on IBM POWER System S822 and IBM Power +// 730 Express server. 
+static cl::opt<int> FirstCacheLevelAssociativity( + "polly-target-1st-cache-level-associativity", + cl::desc("The associativity of the first cache level."), cl::Hidden, + cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory)); + +static cl::opt<int> SecondCacheLevelAssociativity( + "polly-target-2nd-cache-level-associativity", + cl::desc("The associativity of the second cache level."), cl::Hidden, + cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory)); + +static cl::opt<int> FirstCacheLevelSize( + "polly-target-1st-cache-level-size", + cl::desc("The size of the first cache level specified in bytes."), + cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory)); + +static cl::opt<int> SecondCacheLevelSize( + "polly-target-2nd-cache-level-size", + cl::desc("The size of the second cache level specified in bytes."), + cl::Hidden, cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory)); static cl::opt<int> FirstLevelDefaultTileSize( "polly-default-tile-size", @@ -612,21 +629,20 @@ // degree of a cache level is greater than two. Otherwise, another algorithm // for determination of the parameters should be used. if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 && - CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 && - CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 && - CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2)) + FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0 && + FirstCacheLevelAssociativity > 2 && SecondCacheLevelAssociativity > 2)) return {1, 1, 1}; // The quotient should be greater than zero. 
if (PollyPatternMatchingNcQuotient <= 0) return {1, 1, 1}; int Car = floor( - (CacheLevelAssociativity[0] - 1) / + (FirstCacheLevelAssociativity - 1) / (1 + static_cast(MicroKernelParams.Nr) / MicroKernelParams.Mr)); - int Kc = (Car * CacheLevelSizes[0]) / - (MicroKernelParams.Mr * CacheLevelAssociativity[0] * 8); - double Cac = static_cast(Kc * 8 * CacheLevelAssociativity[1]) / - CacheLevelSizes[1]; - int Mc = floor((CacheLevelAssociativity[1] - 2) / Cac); + int Kc = (Car * FirstCacheLevelSize) / + (MicroKernelParams.Mr * FirstCacheLevelAssociativity * 8); + double Cac = static_cast(Kc * 8 * SecondCacheLevelAssociativity) / + SecondCacheLevelSize; + int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac); int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr; return {Mc, Nc, Kc}; } Index: polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll =================================================================== --- polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll +++ polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-cache-level-associativity=8,8 -polly-target-cache-level-sizes=32768,262144 -polly-optimized-scops < %s 2>&1 | FileCheck %s +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 -polly-optimized-scops < %s 2>&1 | FileCheck %s ; ; /* C := alpha*A*B + beta*C */ ; for (i = 0; i < _PB_NI; i++) Index: polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll =================================================================== --- 
polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll +++ polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-cache-level-associativity=8,8 -polly-target-cache-level-sizes=32768,262144 -polly-ast -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 -polly-ast -analyze < %s | FileCheck %s ; ; /* C := alpha*A*B + beta*C */ ; /* _PB_NK % Kc != 0 */ Index: polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll =================================================================== --- polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll +++ polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast < %s 2>&1 | FileCheck %s -; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast -polly-target-cache-level-associativity=8,8 -polly-target-cache-level-sizes=32768,262144 < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast -polly-target-1st-cache-level-size=0 < %s 2>&1 | FileCheck %s +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true 
-polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL ; ; /* C := alpha*A*B + beta*C */ ; for (i = 0; i < _PB_NI; i++)