Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1235,24 +1235,30 @@ SDValue &SOffset, SDValue &ImmOffset) const { SDLoc DL(Constant); + const uint32_t Align = 4; + const uint32_t MaxImm = alignDown(4095, Align); uint32_t Imm = cast(Constant)->getZExtValue(); uint32_t Overflow = 0; - if (Imm >= 4096) { - if (Imm <= 4095 + 64) { - // Use an SOffset inline constant for 1..64 - Overflow = Imm - 4095; - Imm = 4095; + if (Imm > MaxImm) { + if (Imm <= MaxImm + 64) { + // Use an SOffset inline constant for 4..64 + Overflow = Imm - MaxImm; + Imm = MaxImm; } else { // Try to keep the same value in SOffset for adjacent loads, so that // the corresponding register contents can be re-used. // - // Load values with all low-bits set into SOffset, so that a larger - // range of values can be covered using s_movk_i32 - uint32_t High = (Imm + 1) & ~4095; - uint32_t Low = (Imm + 1) & 4095; + // Load values with all low-bits (except for alignment bits) set into + // SOffset, so that a larger range of values can be covered using + // s_movk_i32. + // + // Atomic operations fail to work correctly when individual address + // components are unaligned, even if their sum is aligned. + uint32_t High = (Imm + Align) & ~4095; + uint32_t Low = (Imm + Align) & 4095; Imm = Low; - Overflow = High - 1; + Overflow = High - Align; } } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -3,7 +3,7 @@ ;CHECK-LABEL: {{^}}test1: ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc -;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff +;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) @@ -14,7 +14,7 @@ ;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) ;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc -;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc +;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}} define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { @@ -71,24 +71,24 @@ ;CHECK-LABEL: {{^}}test3: ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc ;CHECK: s_waitcnt vmcnt(0) -;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff +;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc ;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:44 glc ;CHECK-DAG: s_waitcnt vmcnt(0) ;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc -;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc +;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { main_body: %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0) %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0) - %ofs.5 = add i32 %voffset, 42 + %ofs.5 = add i32 %voffset, 44 %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0) %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0) Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll @@ -27,20 +27,20 @@ } ;CHECK-LABEL: {{^}}buffer_load_immoffs_large: -;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x103c +;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x1038 ;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen ;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen -;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095 -;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff -;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093 +;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092 +;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc +;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092 ;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen -;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff -;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1 +;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc +;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: - %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0) - %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0) + %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4152, i1 0, i1 0) + %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36856, i1 0, i1 0) %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0) %d.3 = fadd <4 x float> %d.0, %d.1 %data = fadd <4 x float> %d.2, %d.3 @@ -48,10 +48,10 @@ } ;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse: -;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xfff -;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65 +;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xffc +;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:68 ;VI-NOT: s_mov -;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81 +;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:84 ;VI: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) { main_body: @@ -80,11 +80,11 @@ } ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: -;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:58 +;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: - %ofs = add i32 %1, 58 + %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) ret <4 x float> %data } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -18,18 +18,18 @@ } ;CHECK-LABEL: {{^}}buffer_load_immoffs: -;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0) ret <4 x float> %data } ;CHECK-LABEL: {{^}}buffer_load_immoffs_large: ;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen -;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff -;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1 +;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc +;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: @@ -56,11 +56,11 @@ } ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: -;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58 +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: - %ofs = add i32 %1, 58 + %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) ret <4 x float> %data }