diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -725,6 +725,16 @@ return NewFormatInfo->Format; } +// Return the value in the inclusive range [Lo,Hi] that is aligned to the +// highest power of two. Note that the result is well defined for all inputs +// including corner cases like: +// - if Lo == Hi, return that value +// - if Lo == 0, return 0 (even though the "- 1" below underflows +// - if Lo > Hi, return 0 (as if the range wrapped around) +static unsigned mostAlignedValueInRange(unsigned Lo, unsigned Hi) { + return Hi & maskLeadingOnes(countLeadingZeros((Lo - 1) ^ Hi) + 1); +} + bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, CombineInfo &Paired, @@ -799,22 +809,36 @@ } // Try to shift base address to decrease offsets. - unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0); - CI.BaseOff = std::min(CI.Offset, Paired.Offset); + unsigned Min = std::min(EltOffset0, EltOffset1); + unsigned Max = std::max(EltOffset0, EltOffset1); - if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) { + const unsigned Mask = maskTrailingOnes(8) * 64; + if (((Max - Min) & ~Mask) == 0) { if (Modify) { - CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64; - Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64; + // From the range of values we could use for BaseOff, choose the one that + // is aligned to the highest power of two, to maximise the chance that + // the same offset can be reused for other load/store pairs. + unsigned BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); + // Copy the low bits of the offsets, so that when we adjust them by + // subtracting BaseOff they will be multiples of 64. + BaseOff |= Min & maskTrailingOnes(6); + CI.BaseOff = BaseOff * CI.EltSize; + CI.Offset = (EltOffset0 - BaseOff) / 64; + Paired.Offset = (EltOffset1 - BaseOff) / 64; CI.UseST64 = true; } return true; } - if (isUInt<8>(OffsetDiff)) { + if (isUInt<8>(Max - Min)) { if (Modify) { - CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize; - Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize; + // From the range of values we could use for BaseOff, choose the one that + // is aligned to the highest power of two, to maximise the chance that + // the same offset can be reused for other load/store pairs. + unsigned BaseOff = mostAlignedValueInRange(Max - 0xff, Min); + CI.BaseOff = BaseOff * CI.EltSize; + CI.Offset = EltOffset0 - BaseOff; + Paired.Offset = EltOffset1 - BaseOff; } return true; } diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll --- a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll @@ -9,14 +9,14 @@ ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x200, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x800, [[BASE]] ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100 -; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100 -; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100 -; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:72 offset1:172 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset0:144 offset1:244 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset0:88 offset1:188 define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { bb: %tmp = load float, float addrspace(3)* %arg, align 4 @@ -52,18 +52,14 @@ ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B4:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x640, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x6e0, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x780, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B4:v[0-9]+]], 0x820, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x400, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x800, [[BASE]] -; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:20 -; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:20 -; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:20 -; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B4]] offset1:20 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:144 offset1:164 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:184 offset1:204 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:224 offset1:244 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset0:8 offset1:28 define amdgpu_kernel void @ds_read32_combine_stride_20(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 400 @@ -102,14 +98,14 @@ ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x200, [[BASE]] ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100 -; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100 -; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100 -; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:88 offset1:188 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset0:144 offset1:244 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset0:72 offset1:172 define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700 @@ -180,16 +176,11 @@ ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] - ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]] ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32 -; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32 -; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:64 offset1:96 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:128 offset1:160 define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2 @@ -219,12 +210,12 @@ ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]] ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250 -; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50 +; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:44 offset1:94 define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) { bb: %tmp = load double, double addrspace(3)* %arg, align 8 @@ -259,16 +250,11 @@ ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] - ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]] ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16 -; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16 -; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16 +; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:32 offset1:48 +; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:64 offset1:80 define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) { bb: %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1 @@ -301,14 +287,14 @@ ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x200, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x800, [[BASE]] ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset0:72 offset1:172 +; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset0:144 offset1:244 +; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset0:88 offset1:188 define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) { bb: store float 1.000000e+00, float addrspace(3)* %arg, align 4 @@ -337,14 +323,14 @@ ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x200, [[BASE]] +; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset0:88 offset1:188 +; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset0:144 offset1:244 +; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset0:72 offset1:172 ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700 @@ -396,17 +382,12 @@ ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] - -; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8004, [[BASE]] +; VI-DAG: v_add_u32_e32 [[BASE:v[0-9]+]], vcc, 4, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[BASE:v[0-9]+]], 4, [[BASE]] -; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 -; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 -; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96 +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160 define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1 @@ -429,12 +410,12 @@ ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]] ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250 -; GCN-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50 +; GCN-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:44 offset1:94 define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) { bb: store double 1.000000e+00, double addrspace(3)* %arg, align 8 @@ -459,17 +440,12 @@ ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] - -; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]] -; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]] +; VI-DAG: v_add_u32_e32 [[BASE]], vcc, 8, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[BASE]], 8, [[BASE]] -; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 -; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 -; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 +; GCN-DAG: ds_write2st64_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 +; GCN-DAG: ds_write2st64_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:32 offset1:48 +; GCN-DAG: ds_write2st64_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:64 offset1:80 define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) { bb: %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll --- a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll +++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll @@ -14,12 +14,11 @@ ; GCN-NEXT: s_mov_b32 s1, 0x40100000 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v3, 0x840, v2 -; GCN-NEXT: v_add_u32_e32 v4, 0xc60, v2 +; GCN-NEXT: v_add_u32_e32 v3, 0x800, v2 ; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:66 ; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198 -; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset1:66 -; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[0:1] offset1:66 +; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74 +; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206 ; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -28,8 +27,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:66 ; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198 -; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset1:66 -; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[0:1] offset1:66 +; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74 +; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206 ; GCN-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir --- a/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir @@ -11,12 +11,12 @@ # VI: V_ADD_CO_U32_e64 %6, %0, # VI-NEXT: DS_WRITE2_B32 killed %7, %0, %3, 0, 8, # VI: V_ADD_CO_U32_e64 %10, %3, -# VI-NEXT: DS_READ2_B32 killed %11, 0, 8, +# VI-NEXT: DS_READ2_B32 killed %11, 16, 24, # GFX9: V_ADD_U32_e64 %6, %0, # GFX9-NEXT: DS_WRITE2_B32_gfx9 killed %7, %0, %3, 0, 8, # GFX9: V_ADD_U32_e64 %9, %3, -# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 0, 8, +# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 16, 24, --- | @0 = internal unnamed_addr addrspace(3) global [256 x float] undef, align 4 @@ -94,12 +94,12 @@ # VI: V_ADD_CO_U32_e64 %6, %0.sub0, # VI-NEXT: DS_WRITE2_B32 killed %7, %0.sub0, %3.sub0, 0, 8, # VI: V_ADD_CO_U32_e64 %10, %3.sub0, -# VI-NEXT: DS_READ2_B32 killed %11, 0, 8, +# VI-NEXT: DS_READ2_B32 killed %11, 16, 24, # GFX9: V_ADD_U32_e64 %6, %0.sub0, # GFX9-NEXT: DS_WRITE2_B32_gfx9 killed %7, %0.sub0, %3.sub0, 0, 8, # GFX9: V_ADD_U32_e64 %9, %3.sub0, -# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 0, 8, +# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 16, 24, --- name: ds_combine_base_offset_subreg body: |