Index: lib/Target/R600/AMDGPUInstrInfo.cpp
===================================================================
--- lib/Target/R600/AMDGPUInstrInfo.cpp
+++ lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -218,15 +218,26 @@
   return true;
 }
 
-bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
-                                              int64_t Offset1, int64_t Offset2,
-                                              unsigned NumLoads) const {
-  assert(Offset2 > Offset1
-         && "Second offset should be larger than first offset!");
-  // If we have less than 16 loads in a row, and the offsets are within 16,
-  // then schedule together.
-  // TODO: Make the loads schedule near if it fits in a cacheline
-  return (NumLoads < 16 && (Offset2 - Offset1) < 16);
+// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split into two 16-store batches.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to deal with saying that loads
+// from different address spaces should be clustered, and ones which might
+// cause bank conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+                                              int64_t Offset0, int64_t Offset1,
+                                              unsigned NumLoads) const {
+  assert(Offset1 > Offset0 &&
+         "Second offset should be larger than first offset!");
+  // If we have 16 or fewer loads in a row, and the offsets are within 64
+  // bytes, then schedule them together.
+
+  // A cacheline is 64 bytes (for global memory).
+  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
 }
 
 bool
Index: test/CodeGen/R600/llvm.memcpy.ll
===================================================================
--- test/CodeGen/R600/llvm.memcpy.ll
+++ test/CodeGen/R600/llvm.memcpy.ll
@@ -15,17 +15,18 @@
 ; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
 ; SI: DS_WRITE_B8
+
 ; SI: DS_READ_U8
 ; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
 ; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
 ; SI: DS_WRITE_B8
-
 ; SI: DS_READ_U8
 ; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
 ; SI: DS_WRITE_B8
+
 ; SI: DS_READ_U8
 ; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
@@ -35,9 +36,8 @@
 ; SI: DS_READ_U8
 ; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
+
 ; SI: DS_READ_U8
 ; SI: DS_READ_U8
 
@@ -47,6 +47,7 @@
 ; SI: DS_READ_U8
 ; SI: DS_READ_U8
 ; SI: DS_READ_U8
+
 ; SI: DS_READ_U8
 ; SI: DS_READ_U8
 ; SI: DS_READ_U8
@@ -65,6 +66,9 @@
 ; SI: DS_WRITE_B8
 ; SI: DS_WRITE_B8
 ; SI: DS_WRITE_B8
+
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
 ; SI: DS_WRITE_B8
 ; SI: DS_WRITE_B8
 ; SI: DS_WRITE_B8
@@ -83,21 +87,13 @@
 
 ; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align2
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
 ; SI: DS_READ_U16
 
@@ -117,6 +113,15 @@
 
 ; SI: DS_WRITE_B16
 ; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+
 ; SI: S_ENDPGM
 define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
@@ -278,37 +283,37 @@
 
 ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align2
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
-
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_LOAD_USHORT
+
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
 ; SI-DAG: BUFFER_STORE_SHORT
 
 ; SI: S_ENDPGM
@@ -321,9 +326,9 @@
 
 ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align4
 ; SI: BUFFER_LOAD_DWORDX4
-; SI: BUFFER_STORE_DWORDX4
 ; SI: BUFFER_LOAD_DWORDX4
 ; SI: BUFFER_STORE_DWORDX4
+; SI: BUFFER_STORE_DWORDX4
 ; SI: S_ENDPGM
 define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
@@ -334,9 +339,9 @@
 
 ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align8
 ; SI: BUFFER_LOAD_DWORDX4
-; SI: BUFFER_STORE_DWORDX4
 ; SI: BUFFER_LOAD_DWORDX4
 ; SI: BUFFER_STORE_DWORDX4
+; SI: BUFFER_STORE_DWORDX4
 ; SI: S_ENDPGM
 define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
@@ -347,9 +352,9 @@
 
 ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align16
 ; SI: BUFFER_LOAD_DWORDX4
-; SI: BUFFER_STORE_DWORDX4
 ; SI: BUFFER_LOAD_DWORDX4
 ; SI: BUFFER_STORE_DWORDX4
+; SI: BUFFER_STORE_DWORDX4
 ; SI: S_ENDPGM
 define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
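
For reference, the clustering rule this patch installs reduces to a simple standalone predicate: keep clustering while the run has at most 16 loads and the spread of their offsets stays under one 64-byte global-memory cacheline. The sketch below is illustrative only, not part of the patch; the name shouldClusterLoads is hypothetical, and the SDNode operands are omitted since the heuristic never inspects them.

#include <cassert>
#include <cstdint>

// Hypothetical standalone version of the check added above: cluster while we
// have at most 16 loads whose offsets all fall within a 64-byte window.
static bool shouldClusterLoads(int64_t Offset0, int64_t Offset1,
                               unsigned NumLoads) {
  assert(Offset1 > Offset0 && "Second offset should be larger than first!");
  return NumLoads <= 16 && (Offset1 - Offset0) < 64;
}

int main() {
  assert(shouldClusterLoads(0, 48, 4));   // same 64-byte window: cluster
  assert(!shouldClusterLoads(0, 64, 4));  // spans a cacheline: do not cluster
  assert(!shouldClusterLoads(0, 8, 17));  // too many loads in the run
  return 0;
}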