This combines adjacent loads if no writes to memory or atomic ops occur between them.
This was motivated by the compiler failing to generate a single load plus bswap for:
/* Read 8 bytes starting at pData and assemble them into a uint64_t,
 * treating pData[0] as the most-significant byte (big-endian decode). */
static inline uint64_t LoadU64_x8( const uint8_t* pData )
{
    uint64_t value = 0;
    for ( int i = 0; i < 8; ++i )
    {
        /* Shift the accumulator left one byte and fold in the next input byte;
         * equivalent to the unrolled (pData[0]<<56)|...|(pData[7]) form. */
        value = ( value << 8 ) | (uint64_t)pData[i];
    }
    return value;
}