Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2159,13 +2159,6 @@ unsigned DstSize = getRegSizeInBits(*DstRC); unsigned NewSize = getRegSizeInBits(*NewRC); - // Do not allow coalescing between an odd and an even lanes as it will - // result in misaligned tuple access. - if (ST.hasGFX90AInsts() && !isSGPRClass(NewRC) && - (getChannelFromSubReg(DstSubReg) & 1) != - (getChannelFromSubReg(SubReg) & 1)) - return false; - // Do not increase size of registers beyond dword, we would need to allocate // adjacent registers and constraint regalloc more than needed. Index: llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll +++ llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll @@ -5,9 +5,9 @@ ; GCN-LABEL: {{^}}test_odd_int4: ; GCN: global_load_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v{{[0-9]+}}, s[{{[0-9:]+}}] -; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} -; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[HI:[13579]]], v{{[0-9]+}} -; GCN: global_store_dwordx2 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]], s[{{[0-9:]+}}] +; GCN-NEXT: s_waitcnt +; GCN-NEXT: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} +; GCN-NEXT: global_store_dwordx2 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+\]}}, s[{{[0-9:]+}}] define amdgpu_kernel void @test_odd_int4(<4 x i32> addrspace(1)* %arg, <2 x i32> addrspace(1)* %arg1) { bb: Index: llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -439,23 +439,21 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: BB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[0:1] glc scc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execnz BB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -663,9 +661,9 @@ ; GFX90A-LABEL: local_atomic_fadd_f64_rtn: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] main_body: Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -138,10 +138,9 @@ ; GFX90A-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NEXT: BB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc @@ -149,7 +148,7 @@ ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execnz BB1_1 @@ -271,22 +270,21 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: BB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v1, v1, v[2:3], s[0:1] glc scc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execnz BB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll @@ -256,7 +256,7 @@ } ; GCN-LABEL: image_load_mmo -; GCN: image_load v1, v[4:5], s[0:7] dmask:0x1 unorm +; GCN: image_load v1, v[2:3], s[0:7] dmask:0x1 unorm define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)* %lds, <2 x i32> %c) #0 { store float 0.000000e+00, float addrspace(3)* %lds %c0 = extractelement <2 x i32> %c, i32 0