diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1267,8 +1267,9 @@ ConstantSDNode *C1 = cast(N1); unsigned OffsetValue0 = C1->getZExtValue() / Align; unsigned OffsetValue1 = OffsetValue0 + 1; + bool OffsetIsAligned = Align * OffsetValue0 == C1->getZExtValue(); // (add n0, c0) - if (isDSOffsetLegal(N0, OffsetValue1, 8)) { + if (isDSOffsetLegal(N0, OffsetValue1, 8) && OffsetIsAligned) { Base = N0; Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8); Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8); @@ -1280,8 +1281,9 @@ dyn_cast(Addr.getOperand(0))) { unsigned OffsetValue0 = C->getZExtValue() / Align; unsigned OffsetValue1 = OffsetValue0 + 1; + bool OffsetIsAligned = Align * OffsetValue0 == C->getZExtValue(); - if (isUInt<8>(OffsetValue0)) { + if (isUInt<8>(OffsetValue0) && OffsetIsAligned) { SDLoc DL(Addr); SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -194,20 +194,20 @@ ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset: ; CI: ; %bb.0: ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:254 offset1:255 +; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fb, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:254 offset1:255 +; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX9-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i @@ -223,7 +223,7 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[0:1], 0x9 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0 ; CI-NEXT: s_mov_b64 vcc, 0 ; CI-NEXT: v_mov_b32_e32 v2, 0x7b ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,7 +235,7 @@ ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s1, s0 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:254 offset1:255 +; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 ; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -244,13 +244,13 @@ ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fb, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset0:254 offset1:255 +; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -713,8 +713,8 @@ ; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 -; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset0:1 offset1:2 +; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s0, v2, 5 +; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -725,9 +725,11 @@ ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2 ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset0:1 offset1:2 -; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset0:2 offset1:3 +; GFX9-UNALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 +; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX9-UNALIGNED-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i