diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1688,33 +1688,27 @@ } else { // If the offset doesn't fit, put the low bits into the offset field and // add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. SDLoc DL(N); - uint64_t ImmField; + uint64_t RemainderOffset = COffsetVal; + uint64_t ImmField = 0; const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned); if (IsSigned) { - ImmField = SignExtend64(COffsetVal, NumBits); - - // Don't use a negative offset field if the base offset is positive. - // Since the scheduler currently relies on the offset field, doing so - // could result in strange scheduling decisions. - - // TODO: Should we not do this in the opposite direction as well? - if (static_cast(COffsetVal) > 0) { - if (static_cast(ImmField) < 0) { - const uint64_t OffsetMask = - maskTrailingOnes(NumBits - 1); - ImmField = COffsetVal & OffsetMask; - } - } - } else { - // TODO: Should we do this for a negative offset? - const uint64_t OffsetMask = maskTrailingOnes(NumBits); - ImmField = COffsetVal & OffsetMask; + // Use signed division by a power of two to truncate towards 0. + int64_t D = 1LL << (NumBits - 1); + RemainderOffset = (static_cast(COffsetVal) / D) * D; + ImmField = COffsetVal - RemainderOffset; + } else if (static_cast(COffsetVal) >= 0) { + ImmField = COffsetVal & maskTrailingOnes(NumBits); + RemainderOffset = COffsetVal - ImmField; } - - uint64_t RemainderOffset = COffsetVal - ImmField; - assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned)); assert(RemainderOffset + ImmField == COffsetVal); diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll --- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -191,9 +191,9 @@ ; CHECK-LABEL: {{^}}store_flat_i8_neg_offset: ; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}} -; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v +; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s ; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, -; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4094{{$}} +; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 { %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2 store volatile i8 %x, i8* %fptr.offset @@ -220,9 +220,9 @@ ; CHECK-LABEL: {{^}}load_flat_i8_neg_offset: ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v +; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s ; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, -; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4094{{$}} +; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 { %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2 %val = load volatile i8, i8* %fptr.offset diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -103,9 +103,9 @@ ; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -483,10 +483,10 @@ ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -510,10 +510,10 @@ ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -537,10 +537,10 @@ ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -591,10 +591,10 @@ ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -773,9 +773,9 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1269,10 +1269,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1303,10 +1304,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1337,10 +1339,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1408,9 +1411,9 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -471,10 +471,10 @@ ; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -482,10 +482,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0, v0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761 @@ -498,10 +498,10 @@ ; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -509,10 +509,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760 @@ -525,10 +525,10 @@ ; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -552,10 +552,10 @@ ; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1213,10 +1213,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1227,9 +1228,9 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0, s0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1246,10 +1247,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1260,9 +1262,9 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1279,10 +1281,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1314,9 +1317,9 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -87,18 +87,17 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -107,6 +106,7 @@ ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} entry: @@ -519,11 +519,11 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} entry: %call = tail call i64 @_Z13get_global_idj(i32 0) #2 %conv = and i64 %call, 255 diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -314,13 +314,13 @@ ; GFX803: v_add{{(_co)?}}_{{i|u}}32_e32 ; GFX803: v_addc_u32_e32 -; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v +; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff802, v ; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v ; GFX906-DAG: v_lshrrev_b32_e32 -; GFX906: flat_store_short v[0:1], v2 offset:2050{{$}} +; GFX906: flat_store_short v[0:1], v2{{$}} -; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:2050{{$}} +; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; GFX803: flat_store_short v[0:1], v2{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -363,13 +363,13 @@ ; GFX803-DAG: v_add_u32_e32 ; GFX803-DAG: v_addc_u32_e32 -; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v +; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff001, v ; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v{{[0-9]+}}, vcc -; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:1{{$}} +; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} ; GFX906-DAG: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906: flat_store_byte v[0:1], v2 offset:1{{$}} +; GFX906: flat_store_byte v[0:1], v2{{$}} ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v2{{$}}