Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1484,7 +1484,7 @@ if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && Store->getValue().getValueType().isVector()) { - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); } EVT MemVT = Store->getMemoryVT(); Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1183,7 +1183,8 @@ break; // fall-through case AMDGPUAS::LOCAL_ADDRESS: - return ScalarizeVectorLoad(Op, DAG); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); } } Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -89,8 +89,13 @@ } ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI-DAG: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} + +; FIXME: These moves shouldn't be necessary, it should be able to +; store the same register if offset1 was the non-zero offset. + +; CI: v_mov_b32 +; CI: v_mov_b32 ; CI: buffer_store_dwordx4 ; CI: s_endpgm define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 { @@ -103,8 +108,9 @@ } ; CI-LABEL: {{^}}simple_read2_v4f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 ; CI: buffer_store_dwordx4 ; CI: s_endpgm define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { @@ -118,13 +124,11 @@ ; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v8f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}} ; CI: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}} ; CI: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT4:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}} ; CI: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}} ; CI: v_mov_b32 ; CI: buffer_store_dwordx4 ; CI: buffer_store_dwordx4 @@ -140,21 +144,15 @@ ; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v16f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}} -; CI: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:15{{$}} -; CI: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:5{{$}} ; CI: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT9:[0-9]+]]:[[REG_ELT8:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}} ; CI: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:10 offset1:9{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:2{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}} ; CI: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:14 offset1:13{{$}} ; CI: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:12 offset1:8{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} ; CI: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}} ; CI: v_mov_b32 ; CI: s_waitcnt lgkmcnt(0) Index: test/CodeGen/AMDGPU/indirect-private-64.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-private-64.ll +++ test/CodeGen/AMDGPU/indirect-private-64.ll @@ -29,14 +29,10 @@ ; SI-ALLOCA: buffer_store_dwordx4 ; SI-ALLOCA: buffer_load_dwordx4 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_write_b64 +; SI-PROMOTE: ds_write_b64 +; SI-PROMOTE: ds_read_b64 +; SI-PROMOTE: ds_read_b64 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 %array = alloca <2 x double>, i32 16, align 16 @@ -71,14 +67,10 @@ ; SI-ALLOCA: buffer_store_dwordx4 ; SI-ALLOCA: buffer_load_dwordx4 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_write_b64 +; SI-PROMOTE: ds_write_b64 +; SI-PROMOTE: ds_read_b64 +; SI-PROMOTE: ds_read_b64 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %array = alloca <2 x i64>, i32 16, align 16 Index: test/CodeGen/AMDGPU/llvm.memcpy.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.memcpy.ll +++ test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -132,32 +132,15 @@ } ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4: -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_read2_b32 +; SI: ds_read2_b32 +; SI: ds_read2_b32 +; SI: ds_read2_b32 -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_write2_b32 +; SI: ds_write2_b32 +; SI: ds_write2_b32 +; SI: ds_write2_b32 ; SI: s_endpgm define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { @@ -170,32 +153,15 @@ ; FIXME: Use 64-bit ops ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8: -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_read_b64 -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_write_b64 +; SI: ds_write_b64 +; SI: ds_write_b64 +; SI: ds_write_b64 ; SI-DAG: s_endpgm define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { Index: test/CodeGen/AMDGPU/merge-stores.ll =================================================================== --- test/CodeGen/AMDGPU/merge-stores.ll +++ test/CodeGen/AMDGPU/merge-stores.ll @@ -539,10 +539,15 @@ } ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32: -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 +; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8 +; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d +; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3 + +; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2 +; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b +; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1 + +; GCN: s_endpgm define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 Index: test/CodeGen/AMDGPU/store.ll =================================================================== --- test/CodeGen/AMDGPU/store.ll +++ test/CodeGen/AMDGPU/store.ll @@ -287,16 +287,33 @@ ; CM: LDS_WRITE ; CM: LDS_WRITE -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 +; SI: ds_write_b64 +; SI: ds_write_b64 define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out ret void } +; FUNC-LABEL: {{^}}store_local_v4i32_align4: +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE + +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE + +; SI: ds_write2_b32 +; SI: ds_write2_b32 +define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}store_local_i64_i8: ; EG: LDS_BYTE_WRITE ; SI: ds_write_b8