Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -307,7 +307,11 @@ def local_load : LocalLoad ; class Aligned8Bytes : PatFrag (N)->getAlignment() % 8 == 0; + return cast(N)->getAlignment() >= 8; +}]>; + +class Aligned16Bytes : PatFrag (N)->getAlignment() >= 16; }]>; def local_load_aligned8bytes : Aligned8Bytes < Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -252,6 +252,11 @@ return (getGeneration() >= EVERGREEN); } + /// Returns whether target supports ds_read_b128/ds_write_b128. + bool hasDS128() const { + return CIInsts; + } + bool hasCaymanISA() const { return CaymanISA; } Index: lib/Target/AMDGPU/DSInstructions.td =================================================================== --- lib/Target/AMDGPU/DSInstructions.td +++ lib/Target/AMDGPU/DSInstructions.td @@ -444,17 +444,24 @@ // DS_GWS_SEMA_RELEASE_ALL // DS_WRAP_RTN_B32 // DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 // DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 let SubtargetPredicate = isCIVI in { def DS_WRAP_RTN_F32 : DS_1A1D_RET <"ds_wrap_rtn_f32">, AtomicNoRet<"ds_wrap_f32", 1>; +let mayStore = 0 in { +def DS_READ_B96 : DS_1A_RET<"ds_read_b96", VReg_96>; +def DS_READ_B128: DS_1A_RET<"ds_read_b128", VReg_128>; +} // End mayStore = 0 + +let mayLoad = 0 in { +def DS_WRITE_B96 : DS_1A1D_NORET<"ds_write_b96", VReg_96>; +def DS_WRITE_B128 : DS_1A1D_NORET<"ds_write_b128", VReg_128>; +} // End mayLoad = 0 + + } // let SubtargetPredicate = isCIVI //===----------------------------------------------------------------------===// @@ -502,6 +509,9 @@ def : DSReadPat ; +// TODO: v3i32, also requires align 16 +def : DSReadPat ; + } // End AddedComplexity = 100 def : Pat < @@ -522,8 +532,8 @@ 
def : DSWritePat ; let AddedComplexity = 100 in { - def : DSWritePat ; +def : DSWritePat ; } // End AddedComplexity = 100 def : Pat < @@ -745,6 +755,10 @@ def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>; def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>; +def DS_WRITE_B96_si : DS_Real_si<0xde, DS_WRITE_B96>; +def DS_WRITE_B128_si : DS_Real_si<0xdf, DS_WRITE_B128>; +def DS_READ_B96_si : DS_Real_si<0xfe, DS_READ_B96>; +def DS_READ_B128_si : DS_Real_si<0xff, DS_READ_B128>; //===----------------------------------------------------------------------===// // VIInstructions.td @@ -905,3 +919,7 @@ def DS_WRITE_SRC2_B64_vi : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>; def DS_MIN_SRC2_F64_vi : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>; def DS_MAX_SRC2_F64_vi : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>; +def DS_WRITE_B96_vi : DS_Real_vi<0xde, DS_WRITE_B96>; +def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>; +def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>; +def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -3087,6 +3087,10 @@ llvm_unreachable("unsupported private_element_size"); } case AMDGPUAS::LOCAL_ADDRESS: + // Use ds_read_b128 if possible. + if (NumElements == 4 && Subtarget->hasDS128() && Load->getAlignment() >= 16) + return SDValue(); + if (NumElements > 2) return SplitVectorLoad(Op, DAG); @@ -3494,6 +3498,11 @@ } } case AMDGPUAS::LOCAL_ADDRESS: { + // Use ds_write_b128 if possible. 
+ if (NumElements == 4 && Subtarget->hasDS128() && + Store->getAlignment() >= 16) + return SDValue(); + if (NumElements > 2) return SplitVectorStore(Op, DAG); Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -119,6 +119,10 @@ (ops node:$ptr), (si_load_local node:$ptr) >; +def si_load_local_align16 : Aligned16Bytes < + (ops node:$ptr), (si_load_local node:$ptr) +>; + def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; }]>; @@ -157,6 +161,10 @@ (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr) >; +def si_store_local_align16 : Aligned16Bytes < + (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr) +>; + def si_truncstore_local : PatFrag < (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ return cast<StoreSDNode>(N)->isTruncatingStore(); Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -104,9 +104,9 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i - %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0 + %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8 %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i - store <4 x float> %val0, <4 x float> addrspace(1)* %out.gep + store <4 x float> %val0, <4 x float> addrspace(1)* %out.gep, align 8 ret void } @@ -120,9 +120,9 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = 
getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i - %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0 + %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0, align 8 %out.gep = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %out, i32 %x.i - store <8 x float> %val0, <8 x float> addrspace(1)* %out.gep + store <8 x float> %val0, <8 x float> addrspace(1)* %out.gep, align 8 ret void } @@ -141,9 +141,9 @@ define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i - %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0 + %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0, align 8 %out.gep = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %out, i32 %x.i - store <16 x float> %val0, <16 x float> addrspace(1)* %out.gep + store <16 x float> %val0, <16 x float> addrspace(1)* %out.gep, align 8 ret void } Index: test/CodeGen/AMDGPU/indirect-private-64.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-private-64.ll +++ test/CodeGen/AMDGPU/indirect-private-64.ll @@ -49,8 +49,9 @@ ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_read_b64 ; SI-PROMOTE: ds_read_b64 -; CI-PROMOTE: ds_write2_b64 -; CI-PROMOTE: ds_read2_b64 + +; CI-PROMOTE: ds_write_b128 +; CI-PROMOTE: ds_read_b128 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 %array = alloca [4 x <2 x double>], align 16 @@ -107,8 +108,9 @@ ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_read_b64 ; SI-PROMOTE: ds_read_b64 -; CI-PROMOTE: ds_write2_b64 -; CI-PROMOTE: ds_read2_b64 + +; CI-PROMOTE: ds_write_b128 +; 
CI-PROMOTE: ds_read_b128 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %array = alloca [4 x <2 x i64>], align 16 Index: test/CodeGen/AMDGPU/load-local-f32.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-f32.ll +++ test/CodeGen/AMDGPU/load-local-f32.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_f32_local: @@ -29,8 +29,12 @@ ; FIXME: should this do a read2_b64? 
; FUNC-LABEL: {{^}}local_load_v3f32: -; GCN-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8 -; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} + +; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8 +; SI-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} + +; CIVI: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} + ; GCN: s_waitcnt ; GCN-DAG: ds_write_b64 ; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8{{$}} @@ -46,7 +50,8 @@ } ; FUNC-LABEL: {{^}}local_load_v4f32: -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; CIVI: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -60,8 +65,11 @@ } ; FUNC-LABEL: {{^}}local_load_v8f32: -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 + +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -79,10 +87,15 @@ } ; FUNC-LABEL: {{^}}local_load_v16f32: -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 + +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/load-local-f64.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-f64.ll +++ test/CodeGen/AMDGPU/load-local-f64.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI 
-check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_f64: @@ -16,7 +16,8 @@ } ; FUNC-LABEL: {{^}}local_load_v2f64: -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; CIVI: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -30,8 +31,10 @@ } ; FUNC-LABEL: {{^}}local_load_v3f64: -; GCN-DAG: ds_read2_b64 -; GCN-DAG: ds_read_b64 +; SI-DAG: ds_read2_b64 +; SI-DAG: ds_read_b64 + +; CIVI: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -47,8 +50,11 @@ } ; FUNC-LABEL: {{^}}local_load_v4f64: -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 + +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -67,10 +73,15 @@ } ; FUNC-LABEL: {{^}}local_load_v8f64: -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 + +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -96,14 +107,24 @@ } ; FUNC-LABEL: {{^}}local_load_v16f64: -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 + +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 + ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i16.ll 
+++ test/CodeGen/AMDGPU/load-local-i16.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,CIVI,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i16: @@ -59,7 +59,8 @@ } ; FUNC-LABEL: {{^}}local_load_v8i16: -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; CIVI: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -73,9 +74,11 @@ } ; FUNC-LABEL: {{^}}local_load_v16i16: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -252,7 +255,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32: -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} + +; CIVI: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -266,7 +271,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32: -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} + +; CIVI: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -288,13 
+295,22 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} + +; SI: ds_write2_b64 +; SI: ds_write2_b64 +; SI: ds_write2_b64 +; SI: ds_write2_b64 + + +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}} -; GCN: ds_write2_b64 -; GCN: ds_write2_b64 -; GCN: ds_write2_b64 -; GCN: ds_write2_b64 +; CIVI: ds_write_b128 +; CIVI: ds_write_b128 +; CIVI: ds_write_b128 +; CIVI: ds_write_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -313,8 +329,11 @@ ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} + +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -348,10 +367,15 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 +; SI-DAG: ds_read2_b64 
v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 + +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:32{{$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:48{{$}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -377,18 +401,32 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI-DAG: ds_read2_b64 
v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 + + +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:32 +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:48 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}} +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:16 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:32 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:48 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:64 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:80 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:96 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:112 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -414,30 +452,59 @@ } ; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, 
v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, 
v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11 + +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 + + + + +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:32 +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:48 +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64 +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:80 +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:96 +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}} +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:16 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:32 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:48 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:64 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:80 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:96 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:112 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:128 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:144 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:160 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, 
v{{\[[0-9]+:[0-9]+\]}} offset:176 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:192 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:208 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:224 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:240 ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/load-local-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i32.ll +++ test/CodeGen/AMDGPU/load-local-i32.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -26,8 +26,10 @@ } ; FUNC-LABEL: {{^}}local_load_v3i32: -; GCN-DAG: ds_read_b64 -; GCN-DAG: ds_read_b32 +; SI-DAG: ds_read_b64 +; SI-DAG: ds_read_b32 + +; CIVI: ds_read_b128 define void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in @@ -36,8 +38,8 @@ } ; FUNC-LABEL: {{^}}local_load_v4i32: -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} - +; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; CIVI: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} define void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in @@ -45,9 +47,34 @@ ret void } +; FUNC-LABEL: 
{{^}}local_load_v4i32_align4: +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} +define void @local_load_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +entry: + %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4 + store <4 x i32> %ld, <4 x i32> addrspace(3)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}local_load_v4i32_align8: +; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1{{$}} +define void @local_load_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +entry: + %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8 + store <4 x i32> %ld, <4 x i32> addrspace(3)* %out, align 8 + ret void +} + ; FUNC-LABEL: {{^}}local_load_v8i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} + +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}} define void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in @@ -56,14 +83,23 @@ } ; FUNC-LABEL: {{^}}local_load_v16i32: -; GCN-DAG: ds_read2_b64 
v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; SI-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 + +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:32{{$}} +; CIVI-DAG: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:48{{$}} +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}} +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:48{{$}} define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { entry: %ld = load 
<16 x i32>, <16 x i32> addrspace(3)* %in Index: test/CodeGen/AMDGPU/load-local-i64.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i64.ll +++ test/CodeGen/AMDGPU/load-local-i64.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i64: @@ -16,7 +16,8 @@ } ; FUNC-LABEL: {{^}}local_load_v2i64: -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; CIVI: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -30,8 +31,11 @@ } ; FUNC-LABEL: {{^}}local_load_v3i64: -; GCN-DAG: ds_read2_b64 -; GCN-DAG: ds_read_b64 +; SI-DAG: ds_read2_b64 +; SI-DAG: ds_read_b64 + +; CIVI: ds_read_b128 + ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -47,8 +51,11 @@ } ; FUNC-LABEL: {{^}}local_load_v4i64: -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 + +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -67,10 +74,15 @@ } ; FUNC-LABEL: {{^}}local_load_v8i64: -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 + +; CIVI: ds_read_b128 +; CIVI: 
ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -96,14 +108,25 @@ } ; FUNC-LABEL: {{^}}local_load_v16i64: -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 -; GCN: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 + + +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 +; CIVI: ds_read_b128 + ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/load-local-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i8.ll +++ test/CodeGen/AMDGPU/load-local-i8.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,CIVI,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -64,8 +64,11 @@ } ; FUNC-LABEL: {{^}}local_load_v16i8: -; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}} +; SI: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; SI: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}} + +; CIVI: ds_read_b128 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+$}} +; CIVI: ds_write_b128 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]$}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/local-64.ll 
=================================================================== --- test/CodeGen/AMDGPU/local-64.ll +++ test/CodeGen/AMDGPU/local-64.ll @@ -1,10 +1,10 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=CIVI %s -; BOTH-LABEL: {{^}}local_i32_load -; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 -; BOTH: buffer_store_dword [[REG]], +; GCN-LABEL: {{^}}local_i32_load +; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 +; GCN: buffer_store_dword [[REG]], define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 %val = load i32, i32 addrspace(3)* %gep, align 4 @@ -12,19 +12,19 @@ ret void } -; BOTH-LABEL: {{^}}local_i32_load_0_offset -; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} -; BOTH: buffer_store_dword [[REG]], +; GCN-LABEL: {{^}}local_i32_load_0_offset +; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} +; GCN: buffer_store_dword [[REG]], define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { %val = load i32, i32 addrspace(3)* %in, align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset: -; BOTH-NOT: ADD -; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 -; BOTH: 
buffer_store_byte [[REG]], +; GCN-LABEL: {{^}}local_i8_load_i16_max_offset: +; GCN-NOT: add +; GCN: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 +; GCN: buffer_store_byte [[REG]], define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535 %val = load i8, i8 addrspace(3)* %gep, align 4 @@ -32,14 +32,14 @@ ret void } -; BOTH-LABEL: {{^}}local_i8_load_over_i16_max_offset: +; GCN-LABEL: {{^}}local_i8_load_over_i16_max_offset: ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on ; SI, which is why it is being OR'd with the base pointer. ; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 -; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 -; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] -; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] -; BOTH: buffer_store_byte [[REG]], +; CIVI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; GCN: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] +; GCN: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] +; GCN: buffer_store_byte [[REG]], define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 %val = load i8, i8 addrspace(3)* %gep, align 4 @@ -47,10 +47,10 @@ ret void } -; BOTH-LABEL: {{^}}local_i64_load: -; BOTH-NOT: ADD -; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 -; BOTH: buffer_store_dwordx2 [[REG]], +; GCN-LABEL: {{^}}local_i64_load: +; GCN-NOT: add +; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 +; GCN: buffer_store_dwordx2 [[REG]], define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7 %val = load i64, i64 addrspace(3)* %gep, align 8 @@ -58,19 +58,19 @@ ret void } -; BOTH-LABEL: {{^}}local_i64_load_0_offset -; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; 
BOTH: buffer_store_dwordx2 [[REG]], +; GCN-LABEL: {{^}}local_i64_load_0_offset +; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; GCN: buffer_store_dwordx2 [[REG]], define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { %val = load i64, i64 addrspace(3)* %in, align 8 store i64 %val, i64 addrspace(1)* %out, align 8 ret void } -; BOTH-LABEL: {{^}}local_f64_load: -; BOTH-NOT: ADD -; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 -; BOTH: buffer_store_dwordx2 [[REG]], +; GCN-LABEL: {{^}}local_f64_load: +; GCN-NOT: add +; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 +; GCN: buffer_store_dwordx2 [[REG]], define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { %gep = getelementptr double, double addrspace(3)* %in, i32 7 %val = load double, double addrspace(3)* %gep, align 8 @@ -78,83 +78,91 @@ ret void } -; BOTH-LABEL: {{^}}local_f64_load_0_offset -; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; BOTH: buffer_store_dwordx2 [[REG]], +; GCN-LABEL: {{^}}local_f64_load_0_offset +; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; GCN: buffer_store_dwordx2 [[REG]], define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { %val = load double, double addrspace(3)* %in, align 8 store double %val, double addrspace(1)* %out, align 8 ret void } -; BOTH-LABEL: {{^}}local_i64_store: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 +; GCN-LABEL: {{^}}local_i64_store: +; GCN-NOT: add +; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 define void @local_i64_store(i64 addrspace(3)* %out) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 store i64 5678, i64 addrspace(3)* %gep, align 8 ret void } -; BOTH-LABEL: {{^}}local_i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; GCN-LABEL: 
{{^}}local_i64_store_0_offset: +; GCN-NOT: add +; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { store i64 1234, i64 addrspace(3)* %out, align 8 ret void } -; BOTH-LABEL: {{^}}local_f64_store: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 +; GCN-LABEL: {{^}}local_f64_store: +; GCN-NOT: add +; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 define void @local_f64_store(double addrspace(3)* %out) nounwind { %gep = getelementptr double, double addrspace(3)* %out, i32 7 store double 16.0, double addrspace(3)* %gep, align 8 ret void } -; BOTH-LABEL: {{^}}local_f64_store_0_offset -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; GCN-LABEL: {{^}}local_f64_store_0_offset +; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { store double 20.0, double addrspace(3)* %out, align 8 ret void } -; BOTH-LABEL: {{^}}local_v2i64_store: -; BOTH-NOT: ADD -; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 -; BOTH: s_endpgm +; GCN-LABEL: {{^}}local_v2i64_store: +; GCN-NOT: add +; SI: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 +; CIVI: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112 +; GCN: s_endpgm define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7 store <2 x i64> , <2 x i64> addrspace(3)* %gep, align 16 ret void } -; BOTH-LABEL: {{^}}local_v2i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 -; BOTH: s_endpgm +; GCN-LABEL: {{^}}local_v2i64_store_0_offset: +; GCN-NOT: add +; SI: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 +; CIVI: ds_write_b128 
v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}} +; GCN: s_endpgm define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { store <2 x i64> , <2 x i64> addrspace(3)* %out, align 16 ret void } -; BOTH-LABEL: {{^}}local_v4i64_store: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 -; BOTH: s_endpgm +; GCN-LABEL: {{^}}local_v4i64_store: +; GCN-NOT: add +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 + +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224 +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240 +; GCN: s_endpgm define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7 store <4 x i64> , <4 x i64> addrspace(3)* %gep, align 16 ret void } -; BOTH-LABEL: {{^}}local_v4i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 -; BOTH: s_endpgm +; GCN-LABEL: {{^}}local_v4i64_store_0_offset: +; GCN-NOT: add +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 +; SI-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 + +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}} +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16{{$}} +; GCN: s_endpgm define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { store <4 x i64> , <4 x i64> addrspace(3)* %out, align 16 ret void 
Index: test/CodeGen/AMDGPU/reorder-stores.ll =================================================================== --- test/CodeGen/AMDGPU/reorder-stores.ll +++ test/CodeGen/AMDGPU/reorder-stores.ll @@ -8,10 +8,10 @@ ; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { - %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 - %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16 - store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 - store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16 + %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 8 + %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 8 + store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 8 + store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 8 ret void } @@ -20,10 +20,10 @@ ; SI: ds_write2_b64 ; SI: s_endpgm define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { - %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 - %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16 - store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16 - store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16 + %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 8 + %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 8 + store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 8 + store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 8 ret void } Index: test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll =================================================================== --- test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll +++ test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll @@ -45,7 +45,7 @@ %mul.26.i = mul i32 
%x.i.12.i, %x.i.i %add.i = add i32 %tmp2, %mul.26.i %arrayidx = getelementptr [256 x [8 x <4 x i64>]], [256 x [8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 %add.i - store <4 x i64> zeroinitializer, <4 x i64> addrspace(3)* %arrayidx + store <4 x i64> zeroinitializer, <4 x i64> addrspace(3)* %arrayidx, align 8 %tmp12 = sext i32 %add.i to i64 %arrayidx1 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %srcValues, i64 %tmp12 %tmp13 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx1 @@ -55,7 +55,7 @@ %mul.i = shl i32 %tmp14, 2 %arrayidx.i = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr, i32 %mul.i %tmp15 = bitcast i64 addrspace(3)* %arrayidx.i to <4 x i64> addrspace(3)* - store <4 x i64> %tmp13, <4 x i64> addrspace(3)* %tmp15 + store <4 x i64> %tmp13, <4 x i64> addrspace(3)* %tmp15, align 8 %add.ptr6 = getelementptr [256 x [8 x <4 x i64>]], [256 x [8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 %tmp14, i32 %alignmentOffset %tmp16 = sext i32 %tmp14 to i64 %tmp17 = sext i32 %alignmentOffset to i64 @@ -64,15 +64,15 @@ %trunc = trunc i256 %tmp18 to i64 store i64 %trunc, i64 addrspace(1)* %add.ptr9 %arrayidx10.1 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 1 - %tmp19 = load i64, i64 addrspace(3)* %arrayidx10.1 + %tmp19 = load i64, i64 addrspace(3)* %arrayidx10.1, align 8 %arrayidx11.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 1 - store i64 %tmp19, i64 addrspace(1)* %arrayidx11.1 + store i64 %tmp19, i64 addrspace(1)* %arrayidx11.1, align 8 %arrayidx10.2 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 2 - %tmp20 = load i64, i64 addrspace(3)* %arrayidx10.2 + %tmp20 = load i64, i64 addrspace(3)* %arrayidx10.2, align 8 %arrayidx11.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 2 store i64 %tmp20, i64 addrspace(1)* %arrayidx11.2 %arrayidx10.3 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 3 - %tmp21 = load i64, i64 
addrspace(3)* %arrayidx10.3 + %tmp21 = load i64, i64 addrspace(3)* %arrayidx10.3, align 8 %arrayidx11.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 3 store i64 %tmp21, i64 addrspace(1)* %arrayidx11.3 ret void Index: test/CodeGen/AMDGPU/store-local.ll =================================================================== --- test/CodeGen/AMDGPU/store-local.ll +++ test/CodeGen/AMDGPU/store-local.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s @@ -128,7 +128,8 @@ ; CM: LDS_WRITE ; CM: LDS_WRITE -; GCN: ds_write2_b64 +; SI: ds_write2_b64 +; CIVI: ds_write_b128 define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out Index: test/CodeGen/AMDGPU/store-v3i64.ll =================================================================== --- test/CodeGen/AMDGPU/store-v3i64.ll +++ test/CodeGen/AMDGPU/store-v3i64.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < 
%s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s ; GCN-LABEL: {{^}}global_store_v3i64: ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 @@ -46,8 +46,11 @@ } ; GCN-LABEL: {{^}}local_store_v3i64: -; GCN: ds_write2_b64 -; GCN: ds_write_b64 +; SI: ds_write2_b64 +; SI: ds_write_b64 + +; CIVI-DAG: ds_write_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} +; CIVI-DAG: ds_write_b128 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}} define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 ret void Index: test/MC/AMDGPU/ds.s =================================================================== --- test/MC/AMDGPU/ds.s +++ test/MC/AMDGPU/ds.s @@ -468,3 +468,12 @@ // SICI: ds_read2st64_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xe0,0xd9,0x02,0x00,0x00,0x08] // VI: ds_read2st64_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xf0,0xd8,0x02,0x00,0x00,0x08] +ds_read_b128 v[8:11], v2 +// NOSI: error: instruction not supported on this GPU +// CI: ds_read_b128 v[8:11], v2 ; encoding: [0x00,0x00,0xfc,0xdb,0x02,0x00,0x00,0x08] +// VI: ds_read_b128 v[8:11], v2 ; encoding: [0x00,0x00,0xfe,0xd9,0x02,0x00,0x00,0x08] + +ds_write_b128 v2, v[4:7] +// NOSI: error: instruction not supported on this GPU +// CI: ds_write_b128 v2, v[4:7] ; encoding: [0x00,0x00,0x7c,0xdb,0x02,0x04,0x00,0x00] +// VI: ds_write_b128 v2, v[4:7] ; encoding: [0x00,0x00,0xbe,0xd9,0x02,0x04,0x00,0x00]