Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -226,6 +226,15 @@ Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; } + unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; + + if (NewOffset0 > NewOffset1) { + // Canonicalize the merged instruction so the smaller offset comes first. + std::swap(NewOffset0, NewOffset1); + std::swap(SubRegIdx0, SubRegIdx1); + } + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); @@ -246,9 +255,6 @@ .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); - unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; - unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; - const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); // Copy to the old destination registers. @@ -322,6 +328,12 @@ Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; } + if (NewOffset0 > NewOffset1) { + // Canonicalize the merged instruction so the smaller offset comes first. + std::swap(NewOffset0, NewOffset1); + std::swap(Data0, Data1); + } + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); Index: llvm/trunk/test/CodeGen/AMDGPU/ds_read2_offset_order.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -9,7 +9,7 @@ ; SI-LABEL: {{^}}offset_order: ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:14 offset1:12 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44 define void @offset_order(float addrspace(1)* %out) { Index: llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll +++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -44,9 +44,9 @@ } ; SI-LABEL: @simple_read2st64_f32_max_offset -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:255 offset1:1 +; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { @@ -176,9 +176,9 @@ ; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff ; SI-LABEL: @simple_read2st64_f64_max_offset -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:127 offset1:4 +; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}, v{{\[}}[[LO_VREG]]:{{[0-9]+\]}} +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { Index: llvm/trunk/test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-local-i16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-local-i16.ll @@ -51,7 +51,7 @@ } ; FUNC-LABEL: {{^}}local_load_v8i16: -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}} +; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -65,8 +65,8 @@ } ; FUNC-LABEL: {{^}}local_load_v16i16: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; EG: LDS_READ_RET @@ -256,7 +256,7 @@ ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32: ; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:1{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}} ; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}} define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in @@ -280,7 +280,7 @@ ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32: ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:5{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in Index: llvm/trunk/test/CodeGen/AMDGPU/load-local-i32.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-local-i32.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-local-i32.ll @@ -36,7 +36,7 @@ } ; FUNC-LABEL: {{^}}local_load_v4i32: -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}} +; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} define void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { entry: @@ -46,8 +46,8 @@ } ; FUNC-LABEL: {{^}}local_load_v8i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} define void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in @@ -58,7 +58,7 @@ ; FUNC-LABEL: {{^}}local_load_v16i32: ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5 offset1:6{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:7{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { entry: Index: llvm/trunk/test/CodeGen/AMDGPU/load-local-i8.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-local-i8.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-local-i8.ll @@ -64,8 +64,8 @@ } ; FUNC-LABEL: {{^}}local_load_v16i8: -; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} -; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset0:1{{$}} +; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: llvm/trunk/test/CodeGen/AMDGPU/local-64.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/local-64.ll +++ llvm/trunk/test/CodeGen/AMDGPU/local-64.ll @@ -122,7 +122,7 @@ ; BOTH-LABEL: {{^}}local_v2i64_store: ; BOTH-NOT: ADD -; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:15 offset1:14 +; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 ; BOTH: s_endpgm define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7 @@ -132,7 +132,7 @@ ; BOTH-LABEL: {{^}}local_v2i64_store_0_offset: ; BOTH-NOT: ADD -; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1 +; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 ; BOTH: s_endpgm define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { store <2 x i64> , <2 x i64> addrspace(3)* %out, align 16 @@ -141,8 +141,8 @@ ; BOTH-LABEL: {{^}}local_v4i64_store: ; BOTH-NOT: ADD -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:31 offset1:30 -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:29 offset1:28 +; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 +; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 ; BOTH: s_endpgm define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7 @@ -152,8 +152,8 @@ ; BOTH-LABEL: {{^}}local_v4i64_store_0_offset: ; BOTH-NOT: ADD -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:3 offset1:2 -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1 +; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 +; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 ; BOTH: s_endpgm define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { store <4 x i64> , <4 x i64> addrspace(3)* %out, align 16