Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1304,9 +1304,9 @@ } else if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { unsigned OffsetValue0 = CAddr->getZExtValue() / Align; unsigned OffsetValue1 = OffsetValue0 + 1; - assert(Align * OffsetValue0 == CAddr->getZExtValue()); + bool OffsetIsAligned = Align * OffsetValue0 == CAddr->getZExtValue(); - if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1)) { + if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1) && OffsetIsAligned) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero); Index: llvm/test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s ; FIXME: We don't get cases where the address was an SGPR because we ; get a copy to the address register for each one. @@ -317,7 +318,9 @@ ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-NOT: ds_read2_b32 +; CI-COUNT-4: ds_read_u8 +; GFX9-ALIGNED-4: ds_read_u8 +; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}} ; GCN: s_endpgm define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -336,7 +339,9 @@ ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-NOT: ds_read2_b32 +; CI-COUNT-2: ds_read_u16 +; GFX9-ALIGNED-2: ds_read_u16 +; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}} ; GCN: s_endpgm define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -655,6 +660,22 @@ ret <2 x float> %r1 } +@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 + +; GCN-LABEL: {{^}}read2_v2i32_align1_odd_offset: +; CI-COUNT-8: ds_read_u8 + +; GFX9-ALIGNED-COUNT-8: ds_read_u8 + +; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}} +; GFX9-UNALIGNED: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_ADDR]] offset1:1{{$}} +define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) { +entry: + %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 + store <2 x i32> %load, <2 x i32> addrspace(1)* %out + ret void +} + declare void @void_func_void() #3 declare i32 @llvm.amdgcn.workgroup.id.x() #1 Index: llvm/test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 @@ -523,6 +524,21 @@ ret void } +@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 + +; GCN-LABEL: {{^}}write2_v2i32_align1_odd_offset: +; CI-COUNT-8: ds_write_b8 + +; GFX9-ALIGNED-COUNT-8: ds_write_b8 + +; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}} +; GFX9-UNALIGNED: ds_write2_b32 [[BASE_ADDR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} +define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { +entry: + store <2 x i32> , <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 + ret void +} + declare i32 @llvm.amdgcn.workgroup.id.x() #1 declare i32 @llvm.amdgcn.workgroup.id.y() #1 declare i32 @llvm.amdgcn.workitem.id.x() #1