Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -248,6 +248,10 @@ return cast(N)->getAlignment() % 8 == 0; }]>; +class Aligned16Bytes : PatFrag (N)->getAlignment() % 16 == 0; +}]>; + class LoadFrag : PatFrag<(ops node:$ptr), (op node:$ptr)>; class StoreFrag : PatFrag < @@ -371,6 +375,10 @@ (ops node:$ptr), (load_local node:$ptr) >; +def load_align16_local : Aligned16Bytes < + (ops node:$ptr), (load_local node:$ptr) +>; + def store_align8_local : Aligned8Bytes < (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) >; Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -241,8 +241,13 @@ AddrSpace == AS.CONSTANT_ADDRESS_32BIT || AddrSpace == AS.FLAT_ADDRESS) return 128; - if (AddrSpace == AS.LOCAL_ADDRESS || - AddrSpace == AS.REGION_ADDRESS) + + if (AddrSpace == AS.LOCAL_ADDRESS) { + if (ST->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) + return 128; + return 64; + } + if (AddrSpace == AS.REGION_ADDRESS) return 64; if (AddrSpace == AS.PRIVATE_ADDRESS) return 8 * ST->getMaxPrivateElementSize(); Index: lib/Target/AMDGPU/DSInstructions.td =================================================================== --- lib/Target/AMDGPU/DSInstructions.td +++ lib/Target/AMDGPU/DSInstructions.td @@ -650,6 +650,8 @@ defm : DSReadPat_mc ; +defm : DSReadPat_mc ; + } // End AddedComplexity = 100 let OtherPredicates = [HasD16LoadStore] in { Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2251,6 +2251,10 @@ } } +static bool isAligned16(unsigned Alignment) { + return Alignment % 16 == 0; +} + bool 
SITargetLowering::isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg, const SmallVectorImpl &Outs, @@ -5419,14 +5423,13 @@ llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + unsigned Alignment = Load->getAlignment(); + if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS && + isAligned16(Alignment) && MemVT.getStoreSize() == 16) + return SDValue(); + if (NumElements > 2) return SplitVectorLoad(Op, DAG); - - if (NumElements == 2) - return SDValue(); - - // If properly aligned, if we split we might be able to use ds_read_b64. - return SplitVectorLoad(Op, DAG); } return SDValue(); } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -410,6 +410,9 @@ def load_glue_align8 : Aligned8Bytes < (ops node:$ptr), (load_glue node:$ptr) >; +def load_glue_align16 : Aligned16Bytes < + (ops node:$ptr), (load_glue node:$ptr) +>; def load_local_m0 : LoadFrag, LocalAddress; @@ -418,6 +421,7 @@ def az_extloadi8_local_m0 : LoadFrag, LocalAddress; def az_extloadi16_local_m0 : LoadFrag, LocalAddress; def load_align8_local_m0 : LoadFrag , LocalAddress; +def load_align16_local_m0 : LoadFrag , LocalAddress; def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore, Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -98,7 +98,7 @@ } ; CI-LABEL: {{^}}simple_read2_v4f32_superreg: -; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}} +; CI-DAG: ds_read_b128 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; CI: buffer_store_dwordx4 [[REG_ZW]] ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { @@ -112,8 +112,8 @@ ; FIXME: Extra moves shuffling 
superregister ; CI-LABEL: {{^}}simple_read2_v8f32_superreg: -; CI-DAG: ds_read2_b64 [[VEC_HI:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}} -; CI-DAG: ds_read2_b64 [[VEC_LO:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}} +; CI-DAG: ds_read_b128 [[VEC_HI:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:16 +; CI-DAG: ds_read_b128 [[VEC_LO:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}} ; CI: s_endpgm @@ -128,10 +128,10 @@ ; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v16f32_superreg: -; CI-DAG: ds_read2_b64 [[VEC0_3:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}} -; CI-DAG: ds_read2_b64 [[VEC4_7:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}} -; CI-DAG: ds_read2_b64 [[VEC8_11:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:4 offset1:5{{$}} -; CI-DAG: ds_read2_b64 [[VEC12_15:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:6 offset1:7{{$}} +; CI-DAG: ds_read_b128 [[VEC0_3:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; CI-DAG: ds_read_b128 [[VEC4_7:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:16 +; CI-DAG: ds_read_b128 [[VEC8_11:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:32 +; CI-DAG: ds_read_b128 [[VEC12_15:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:48 ; CI: s_waitcnt lgkmcnt(0) ; CI-DAG: buffer_store_dwordx4 [[VEC0_3]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}} ; CI-DAG: buffer_store_dwordx4 [[VEC4_7]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 Index: test/CodeGen/AMDGPU/indirect-private-64.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-private-64.ll +++ test/CodeGen/AMDGPU/indirect-private-64.ll @@ -50,7 +50,7 @@ ; SI-PROMOTE: ds_read_b64 ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write2_b64 -; CI-PROMOTE: ds_read2_b64 +; CI-PROMOTE: ds_read_b128 define amdgpu_kernel 
void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 %array = alloca [4 x <2 x double>], align 16, addrspace(5) @@ -108,7 +108,7 @@ ; SI-PROMOTE: ds_read_b64 ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write2_b64 -; CI-PROMOTE: ds_read2_b64 +; CI-PROMOTE: ds_read_b128 define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %array = alloca [4 x <2 x i64>], align 16, addrspace(5) Index: test/CodeGen/AMDGPU/load-local-f32.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-f32.ll +++ test/CodeGen/AMDGPU/load-local-f32.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC,GCN-DEF %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC,GFX8 %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}load_f32_local: @@ -46,7 +46,7 @@ ; EG: LDS_READ_RET define amdgpu_kernel void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 { entry: - %tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in + %tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in, align 8 store <3 x float> %tmp0, <3 x float> addrspace(3)* %out ret void } @@ -55,7 +55,8 @@ ; SICIVI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GFX8: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -72,8 +73,10 @@ ; SICIVI: s_mov_b32 m0 ; GFX9-NOT: 
m0 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GFX8: ds_read_b128 +; GFX8: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -94,10 +97,15 @@ ; SICIVI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 + +; GFX8: ds_read_b128 +; GFX8: ds_read_b128 +; GFX8: ds_read_b128 +; GFX8: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/load-local-f64.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-f64.ll +++ test/CodeGen/AMDGPU/load-local-f64.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC,GCN-DEF %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC,GFX789 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC,GFX789 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC,GFX789 %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}local_load_f64: @@ -23,7 +23,7 @@ ; SICIV: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -51,7 +51,7 @@ ; EG: LDS_READ_RET define amdgpu_kernel void 
@local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 { entry: - %ld = load <3 x double>, <3 x double> addrspace(3)* %in + %ld = load <3 x double>, <3 x double> addrspace(3)* %in, align 8 store <3 x double> %ld, <3 x double> addrspace(3)* %out ret void } @@ -60,8 +60,10 @@ ; SICIV: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -83,10 +85,15 @@ ; SICIV: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 + +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -115,14 +122,23 @@ ; SICIV: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 + +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i16.ll +++ test/CodeGen/AMDGPU/load-local-i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC,GCN-LOAD %s ; RUN: llc 
-march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,GFX89,FUNC %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -75,7 +75,8 @@ ; GFX9-NOT: m0 ; SICIVI: s_mov_b32 m0 -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -92,9 +93,10 @@ ; GFX9-NOT: m0 ; SICIVI: s_mov_b32 m0 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} - +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -300,7 +302,8 @@ ; GFX9-NOT: m0 ; SICIVI: s_mov_b32 m0 -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -317,7 +320,8 @@ ; GFX9-NOT: m0 ; SICIVI: s_mov_b32 m0 -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -342,8 +346,10 @@ ; GFX9-NOT: m0 ; SICIVI: s_mov_b32 m0 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} 
offset0:2 offset1:3{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 ; GCN: ds_write2_b64 ; GCN: ds_write2_b64 @@ -370,8 +376,10 @@ ; SICIVI: s_mov_b32 m0 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -408,10 +416,14 @@ ; GFX9-NOT: m0 ; SICIVI: s_mov_b32 m0 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:32 +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:48 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -440,10 +452,14 @@ ; GFX9-NOT: m0 ; SICIVI: s_mov_b32 m0 -; GCN-DAG: ds_read2_b64 
v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:32 +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:48 +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 @@ -480,14 +496,23 @@ ; GFX9-NOT: m0 ; SICIVI: s_mov_b32 m0 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11 +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; 
GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GCN-LOAD: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} + ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27 Index: test/CodeGen/AMDGPU/load-local-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i32.ll +++ test/CodeGen/AMDGPU/load-local-i32.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC,GCN-DEF %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs 
< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC,GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC,GFX89 %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i32: @@ -37,8 +37,8 @@ ; GCN-DAG: ds_read_b32 define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 { entry: - %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in - store <3 x i32> %ld, <3 x i32> addrspace(3)* %out + %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8 + store <3 x i32> %ld, <3 x i32> addrspace(3)* %out, align 8 ret void } @@ -46,7 +46,8 @@ ; SICIVI: s_mov_b32 m0, -1 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-DEF: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { entry: @@ -59,8 +60,10 @@ ; SICIVI: s_mov_b32 m0, -1 ; GFX9-NOT: m0 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-DEF: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GCN-DEF: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in @@ -72,10 +75,14 @@ ; SICIVI: s_mov_b32 m0, -1 ; GFX9-NOT: m0 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 
offset1:5{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-DEF: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}} +; GCN-DEF: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}} +; GCN-DEF: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GCN-DEF: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:32 +; GFX89: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:48 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 Index: test/CodeGen/AMDGPU/load-local-i64.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i64.ll +++ test/CodeGen/AMDGPU/load-local-i64.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC,GCN-DEF %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC,GFX789 %s +; RUN: 
llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC,GFX789 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC,GFX789 %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}local_load_i64: @@ -23,7 +23,8 @@ ; SICIVI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GFX789: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -51,7 +52,7 @@ ; EG: LDS_READ_RET define amdgpu_kernel void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 { entry: - %ld = load <3 x i64>, <3 x i64> addrspace(3)* %in + %ld = load <3 x i64>, <3 x i64> addrspace(3)* %in, align 8 store <3 x i64> %ld, <3 x i64> addrspace(3)* %out ret void } @@ -60,8 +61,10 @@ ; SICIVI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -83,10 +86,14 @@ ; SICIVI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -115,14 +122,23 @@ ; SICIVI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 +; GCN-DEF: ds_read2_b64 + +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 +; GFX789:
ds_read_b128 +; GFX789: ds_read_b128 +; GFX789: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/load-local-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i8.ll +++ test/CodeGen/AMDGPU/load-local-i8.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC,GCN-LOAD %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC,GFX8 %s ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s ; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -71,7 +71,9 @@ ; FUNC-LABEL: {{^}}local_load_v16i8: ; GFX9-NOT: m0 -; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-LOAD: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; GFX8: ds_read_b128 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} +; GFX9: ds_read_b128 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} ; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}} ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/reorder-stores.ll =================================================================== --- test/CodeGen/AMDGPU/reorder-stores.ll +++ test/CodeGen/AMDGPU/reorder-stores.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI %s -; RUN: llc
-amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefixes=SI,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=SI,GCN-SEA %s ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: ; SI: buffer_load_dwordx4 @@ -16,7 +16,8 @@ } ; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store: -; SI: ds_read2_b64 +; GCN: ds_read2_b64 +; GCN-SEA: ds_read_b128 ; SI: ds_write2_b64 ; SI: s_endpgm define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {