// common multiple chains for the load/stores with same offsets in the loop,
// so that we can reuse the offsets and reduce the register pressure in the
// loop. This transformation can also increase the loop ILP as now each chain
// uses its own loop induction add/addi. But this will increase the number of
// add/addi in the loop.
//
// char *p;
// A1 = p + base1
// A2 = p + base1 + offset
// B1 = p + base2
// B2 = p + base2 + offset
//
// for (int i = 0; i < n; i++) {
// unsigned long x1 = *(unsigned long *)(A1 + i);
// unsigned long x2 = *(unsigned long *)(A2 + i);
// unsigned long x3 = *(unsigned long *)(B1 + i);
// unsigned long x4 = *(unsigned long *)(B2 + i);
// }
//
// to look like this:
//
// A1_new = p + base1 // chain 1
// B1_new = p + base2 // chain 2, now inside the loop, common offset is
// // reused.
//
// for (long long i = 0; i < n; i+=count) {
// unsigned long x1 = *(unsigned long *)(A1_new + i);
// unsigned long x2 = *(unsigned long *)((A1_new + i) + offset);
// unsigned long x3 = *(unsigned long *)(B1_new + i);
// unsigned long x4 = *(unsigned long *)((B1_new + i) + offset);
// }
Found some improvements for our internal benchmarks.
Can we avoid using ' in examples?