// common multiple chains for the loads/stores with the same offsets in the
// loop, so that we can reuse the offsets and reduce the register pressure in
// the loop. This transformation can also increase the loop ILP, as each chain
// now uses its own loop induction add/addi. But this will increase the number
// of add/addi in the loop.
//
// Generically, this means transforming loops like this:
//
// char *p;
// A1 = p + base1
// A2 = p + base1 + offset
// B1 = p + base2
// B2 = p + base2 + offset
//
// for (int i = 0; i < n; i++) {
//   unsigned long x1 = *(unsigned long *)(A1 + i);
//   unsigned long x2 = *(unsigned long *)(A2 + i);
//   unsigned long x3 = *(unsigned long *)(B1 + i);
//   unsigned long x4 = *(unsigned long *)(B2 + i);
// }
//
// to look like this:
//
// A1_new = p + base1 // chain 1
// B1_new = p + base2 // chain 2, now inside the loop, common offset is
//                    // reused.
//
// for (long long i = 0; i < n; i+=count) {
//   unsigned long x1 = *(unsigned long *)(A1_new + i);
//   unsigned long x2 = *(unsigned long *)((A1_new + i) + offset);
//   unsigned long x3 = *(unsigned long *)(B1_new + i);
//   unsigned long x4 = *(unsigned long *)((B1_new + i) + offset);
// }
We found some improvements on our internal benchmarks.
Can we avoid using ' in examples?