Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -105,6 +105,7 @@
 
   bool isConstantLoad(const MemSDNode *N, int cbID) const;
   bool isUniformBr(const SDNode *N) const;
+  bool is8ByteAligned(const MemSDNode *N) const;
 
   SDNode *glueCopyToM0(SDNode *N) const;
 
@@ -650,6 +651,25 @@
          Term->getMetadata("structurizecfg.uniform");
 }
 
+bool AMDGPUDAGToDAGISel::is8ByteAligned(const MemSDNode *N) const {
+  if ((N->getAlignment() & 7) == 0)
+    return true;
+
+  if (N->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS)
+    return false;
+
+  // LDS space is allocated to a work-group or wavefront in contiguous blocks
+  // of 64 dwords on a 64-dword boundary, so proving that both the constant
+  // offset and the base pointer are multiples of 8 proves 8-byte alignment.
+  if ((N->getSrcValueOffset() & 7) != 0)
+    return false;
+
+  SDValue Offset = N->getBasePtr();
+  KnownBits Known;
+  CurDAG->computeKnownBits(Offset, Known);
+  return Known.countMinTrailingZeros() >= Log2_32(8);
+}
+
 StringRef AMDGPUDAGToDAGISel::getPassName() const {
   return "AMDGPU DAG->DAG Pattern Instruction Selection";
 }
Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -245,7 +245,7 @@
 //===----------------------------------------------------------------------===//
 
 class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
-  return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
+  return is8ByteAligned(cast<MemSDNode>(N));
 }]>;
 
 class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>;
Index: test/CodeGen/AMDGPU/ds_read2.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2.ll
+++ test/CodeGen/AMDGPU/ds_read2.ll
@@ -442,7 +442,7 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
+; GCN: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -472,8 +472,7 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
@@ -488,10 +487,9 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
-; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
-; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
+; GCN-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], 0
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, [[BASE]] offset:16384
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, [[BASE]] offset:32760
 ; GCN: s_endpgm
 define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
@@ -650,6 +648,60 @@
   ret <2 x float> %r1
 }
 
+@localMemory = internal unnamed_addr addrspace(3) constant [819 x float] undef, align 4
+; Function Attrs: nounwind readnone
+declare i64 @hc_get_workitem_id(i32)
+
+; The address is only provably 4-byte aligned, so lowering must not widen these loads.
+; GCN-LABEL: load_v2f32_local_unaligned8:
+; GCN: ds_read2_b32
+; GCN: ds_read2_b32
+define amdgpu_kernel void @load_v2f32_local_unaligned8(float addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @hc_get_workitem_id(i32 0)
+  %conv = trunc i64 %call to i32
+  %index = and i32 %conv, 7
+  %localPtr = getelementptr inbounds [819 x float], [819 x float] addrspace(3)* @localMemory, i32 0, i32 %index
+  %elt1 = load float, float addrspace(3)* %localPtr, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %localPtr, i32 1
+  %elt2 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %localPtr, i32 2
+  %elt3 = load float, float addrspace(3)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float, float addrspace(3)* %localPtr, i32 3
+  %elt4 = load float, float addrspace(3)* %arrayidx3, align 4
+  %mul1 = fmul float %elt1, %elt2
+  %mul2 = fmul float %elt3, %elt4
+  %add = fadd float %mul1, %mul2
+  %arrayidx11 = getelementptr inbounds float, float addrspace(1)* %out, i64 0
+  store float %add, float addrspace(1)* %arrayidx11, align 4
+  ret void
+}
+
+; The load address is provably 8-byte aligned, so ds_read2_b64 can be selected.
+; GCN-LABEL: load_v2f32_local_aligned8:
+; GCN: ds_read2_b64
+define amdgpu_kernel void @load_v2f32_local_aligned8(float addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @hc_get_workitem_id(i32 0)
+  %conv = trunc i64 %call to i32
+  %rem = and i32 %conv, 7
+  %index = shl nuw nsw i32 %rem, 3
+  %localPtr = getelementptr inbounds [819 x float], [819 x float] addrspace(3)* @localMemory, i32 0, i32 %index
+  %elt1 = load float, float addrspace(3)* %localPtr, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %localPtr, i32 1
+  %elt2 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %localPtr, i32 2
+  %elt3 = load float, float addrspace(3)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float, float addrspace(3)* %localPtr, i32 3
+  %elt4 = load float, float addrspace(3)* %arrayidx3, align 4
+  %mul1 = fmul float %elt1, %elt2
+  %mul2 = fmul float %elt3, %elt4
+  %add = fadd float %mul1, %mul2
+  %arrayidx11 = getelementptr inbounds float, float addrspace(1)* %out, i64 0
+  store float %add, float addrspace(1)* %arrayidx11, align 4
+  ret void
+}
+
 declare void @void_func_void() #3
 
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
Index: test/CodeGen/AMDGPU/ds_read2_offset_order.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -7,7 +7,7 @@
 ; SI-LABEL: {{^}}offset_order:
 
 ; SI-DAG: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}}
-; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
+; SI-DAG: ds_read_b64 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:8
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
 define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -8,7 +8,7 @@
 @lds.v16 = addrspace(3) global [512 x <16 x float>] undef, align 4
 
 ; CI-LABEL: {{^}}simple_read2_v2f32_superreg_align4:
-; CI: ds_read2_b32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI: ds_read_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; CI: s_endpgm
@@ -36,10 +36,9 @@
 }
 
 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[REG_W]]
+; CI-DAG: ds_read2_b64 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v{{[0-9]+}}, v[[REG_Y]]
 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD0]], v[[ADD1]]
 ; CI: buffer_store_dword v[[ADD2]]
 ; CI: s_endpgm
@@ -62,7 +61,7 @@
 }
 
 ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read_b64 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}}
 ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]]
 ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[ADD0]], v[[REG_Y]]
Index: test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_write2.ll
+++ test/CodeGen/AMDGPU/ds_write2.ll
@@ -390,7 +390,7 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN: ds_write_b64 [[ZERO]], {{v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @store_constant_adjacent_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -417,8 +417,7 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 [[ZERO]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
 ; GCN: s_endpgm
 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
@@ -432,10 +431,8 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
-; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16384
+; GCN-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:32760
 ; GCN: s_endpgm
 define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
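
Note: the alignment proof in is8ByteAligned() above reduces to a known-bits
check on the address. A minimal standalone sketch of the same reasoning, for
reference only (the helper name isAlignedTo is illustrative and not part of
this patch):

  // Sketch (not from this patch): an address is 8-byte aligned iff its low
  // 3 bits are known to be zero. SelectionDAG::computeKnownBits() fills in
  // which bits of a value are known-zero/known-one, and
  // KnownBits::countMinTrailingZeros() gives the guaranteed trailing-zero
  // count, so a result >= Log2_32(8) == 3 proves 8-byte alignment.
  // (Needs llvm/CodeGen/SelectionDAG.h, llvm/Support/KnownBits.h,
  // llvm/Support/MathExtras.h and <cassert>.)
  static bool isAlignedTo(llvm::SelectionDAG &DAG, llvm::SDValue Addr,
                          unsigned Align) {
    assert(llvm::isPowerOf2_32(Align) && "alignment must be a power of two");
    llvm::KnownBits Known;
    DAG.computeKnownBits(Addr, Known);
    return Known.countMinTrailingZeros() >= llvm::Log2_32(Align);
  }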