diff --git a/SingleSource/UnitTests/Vectorizer/common.h b/SingleSource/UnitTests/Vectorizer/common.h --- a/SingleSource/UnitTests/Vectorizer/common.h +++ b/SingleSource/UnitTests/Vectorizer/common.h @@ -17,6 +17,42 @@ _Pragma("clang loop vectorize(enable)") Loop \ }; +#define DEFINE_NESTED_SCALAR_AND_VECTOR_FN4(InnerLoopCode) \ + auto ScalarFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \ + for (unsigned long i = 0; i < OuterTC; i++) { \ + _Pragma("clang loop vectorize(disable) interleave_count(1)") \ + for (unsigned long j = 0; j < InnerTC; j++) { \ + InnerLoopCode \ + } \ + } \ + }; \ + auto VectorFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \ + for (unsigned long i = 0; i < OuterTC; i++) { \ + _Pragma("clang loop vectorize(enable)") \ + for (unsigned long j = 0; j < InnerTC; j++) { \ + InnerLoopCode \ + } \ + } \ + }; + +#define DEFINE_NESTED_SCALAR_AND_VECTOR_FN5(InnerLoopCode) \ + auto ScalarFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \ + for (long i = OuterTC - 1; i >= 0; i--) { \ + _Pragma("clang loop vectorize(disable) interleave_count(1)") \ + for (unsigned long j = 0; j < InnerTC; j++) { \ + InnerLoopCode \ + } \ + } \ + }; \ + auto VectorFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \ + for (long i = OuterTC - 1; i >= 0; i--) { \ + _Pragma("clang loop vectorize(enable)") \ + for (unsigned long j = 0; j < InnerTC; j++) { \ + InnerLoopCode \ + } \ + } \ + }; + static std::mt19937 rng; // Initialize arrays A with random numbers. diff --git a/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp b/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp --- a/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp +++ b/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp @@ -7,6 +7,7 @@ #include "common.h" + // Tests for memory runtime checks generated by the vectorizer. Runs scalar and // vectorized versions of a loop requiring runtime checks on the same inputs // with pointers to the same buffer using various offsets between reads and @@ -108,6 +109,51 @@ CheckWithOffsetSecond(i); } + + +template +using Fn4Ty = std::function; +template +static void checkOverlappingMemoryTwoRuntimeChecksNested(Fn4Ty ScalarFn, + Fn4Ty VectorFn, + const int OuterTC, + const int InnerTC, + const char *Name) { + std::cout << "Checking " << Name << "\n"; + + // Make sure we have enough extra elements so we can be liberal with offsets. + const unsigned NumArrayElements = (InnerTC * (OuterTC + 1)) * 8; + std::unique_ptr Input1(new Ty[NumArrayElements]); + std::unique_ptr Reference(new Ty[NumArrayElements]); + std::unique_ptr ToCheck(new Ty[NumArrayElements]); + + auto CheckWithOffsetSecond = [&](int Offset) { + init_data(Input1, NumArrayElements); + for (unsigned i = 0; i < NumArrayElements; i++) { + Reference[i] = Input1[i]; + ToCheck[i] = Input1[i]; + } + + // Run scalar function to generate reference output. + Ty *ReferenceStart = &Reference[NumArrayElements / 2]; + ScalarFn(ReferenceStart + Offset, ReferenceStart, OuterTC, InnerTC); + + // Run vector function to generate output to check. + Ty *StartPtr = &ToCheck[NumArrayElements / 2]; + callThroughOptnone(VectorFn, StartPtr + Offset, StartPtr, OuterTC, InnerTC); + + // Compare scalar and vector output. + check(Reference, ToCheck, NumArrayElements, Offset); + }; + + // With a nested loop, sometimes the runtime checks will fail and sometimes + // succeed. For example, with large offsets you'd expect for the first and + // last one or two executions of the inner loop there is no overlap. + for (int i = -(2 * (InnerTC + 1)); i <= (2 * (InnerTC + 1)); i++) + CheckWithOffsetSecond(i); +} + + int main(void) { rng = std::mt19937(15); @@ -261,5 +307,73 @@ ScalarFn, VectorFn, "1 read, 2 writes, simple indices, uint64_t"); } + { + DEFINE_NESTED_SCALAR_AND_VECTOR_FN4( + auto X = B[(i * OuterTC) + j]; + A[(i * (OuterTC + 1)) + j] = X; + ); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint8_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint32_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint64_t"); + + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint8_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint32_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint64_t"); + } + + { + DEFINE_NESTED_SCALAR_AND_VECTOR_FN4( + auto X = B[(i * OuterTC) + j]; + A[(i * (OuterTC + 1)) + j] += X; + ); + + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint8_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint32_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint64_t"); + + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint8_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint32_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint64_t"); + } + + { + DEFINE_NESTED_SCALAR_AND_VECTOR_FN5( + auto X = B[(i * OuterTC) + j]; + A[(i * (OuterTC + 1)) + j] = X; + ); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t"); + } + + { + DEFINE_NESTED_SCALAR_AND_VECTOR_FN5( + auto X = B[(i * OuterTC) + j]; + A[(i * (OuterTC + 1)) + j] += X; + ); + + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t"); + checkOverlappingMemoryTwoRuntimeChecksNested( + ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t"); + } + return 0; } diff --git a/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output b/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output --- a/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output +++ b/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output @@ -28,4 +28,22 @@ Checking 1 read, 2 writes, simple indices, uint8_t Checking 1 read, 2 writes, simple indices, uint32_t Checking 1 read, 2 writes, simple indices, uint64_t +Checking 1 read, 1 write, nested loop (matching trip counts), uint8_t +Checking 1 read, 1 write, nested loop (matching trip counts), uint32_t +Checking 1 read, 1 write, nested loop (matching trip counts), uint64_t +Checking 1 read, 1 write, nested loop (different trip counts), uint8_t +Checking 1 read, 1 write, nested loop (different trip counts), uint32_t +Checking 1 read, 1 write, nested loop (different trip counts), uint64_t +Checking 2 reads, 1 write, nested loop (matching trip counts), uint8_t +Checking 2 reads, 1 write, nested loop (matching trip counts), uint32_t +Checking 2 reads, 1 write, nested loop (matching trip counts), uint64_t +Checking 2 reads, 1 write, nested loop (different trip counts), uint8_t +Checking 2 reads, 1 write, nested loop (different trip counts), uint32_t +Checking 2 reads, 1 write, nested loop (different trip counts), uint64_t +Checking 1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t +Checking 1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t +Checking 1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t +Checking 2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t +Checking 2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t +Checking 2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t exit 0