Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -56,7 +56,7 @@ // for SI has the unhelpful behavior that it unsets everything else if you // disable it. - SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. FullFS += "+flat-for-global,"; FullFS += FS; Index: test/CodeGen/AMDGPU/address-space.ll =================================================================== --- test/CodeGen/AMDGPU/address-space.ll +++ test/CodeGen/AMDGPU/address-space.ll @@ -8,8 +8,7 @@ ; CHECK-LABEL: {{^}}do_as_ptr_calcs: ; CHECK: s_load_dword [[SREG1:s[0-9]+]], ; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] -; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12 -; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:20 +; CHECK-DAG: ds_read2_b32 v[{{[0-9+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind { entry: %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 Index: test/CodeGen/AMDGPU/ctpop.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop.ll +++ test/CodeGen/AMDGPU/ctpop.ll @@ -62,7 +62,7 @@ ; GCN: buffer_load_dword [[VAL0:v[0-9]+]], ; GCN: s_waitcnt ; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} -; GCN-NEXT: buffer_store_dword [[RESULT]], +; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { %val0 = load i32, i32 addrspace(1)* %in0, align 4 @@ -203,8 +203,8 @@ } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; GCN: buffer_store_dword [[RESULT]], Index: test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll =================================================================== --- test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll +++ test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll @@ -7,8 +7,7 @@ ; GCN-LABEL: {{^}}reschedule_global_load_lds_store: ; GCN: buffer_load_dword ; GCN: buffer_load_dword -; GCN: ds_write_b32 -; GCN: ds_write_b32 +; GCN: ds_write2_b32 ; GCN: s_endpgm define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 { entry: Index: test/CodeGen/AMDGPU/fceil64.ll =================================================================== --- test/CodeGen/AMDGPU/fceil64.ll +++ test/CodeGen/AMDGPU/fceil64.ll @@ -13,8 +13,8 @@ ; CI: v_ceil_f64_e32 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01 -; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]] +; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01 +; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]] ; SI-DAG: s_not_b64 ; SI-DAG: s_and_b64 ; SI-DAG: cmp_gt_i32 Index: test/CodeGen/AMDGPU/fmax3.ll =================================================================== --- test/CodeGen/AMDGPU/fmax3.ll +++ test/CodeGen/AMDGPU/fmax3.ll @@ -11,9 +11,9 @@ ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 + %a = load volatile float, float addrspace(1)* %aptr, align 4 + %b = load volatile float, float addrspace(1)* %bptr, align 4 + %c = load volatile float, float addrspace(1)* %cptr, align 4 %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone store float %f1, float addrspace(1)* %out, align 4 @@ -29,9 +29,9 @@ ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 + %a = load volatile float, float addrspace(1)* %aptr, align 4 + %b = load volatile float, float addrspace(1)* %bptr, align 4 + %c = load volatile float, float addrspace(1)* %cptr, align 4 %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone store float %f1, float addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/fmin3.ll =================================================================== --- test/CodeGen/AMDGPU/fmin3.ll +++ test/CodeGen/AMDGPU/fmin3.ll @@ -12,9 +12,9 @@ ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 + %a = load volatile float, float addrspace(1)* %aptr, align 4 + %b = load volatile float, float addrspace(1)* %bptr, align 4 + %c = load volatile float, float addrspace(1)* %cptr, align 4 %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone store float %f1, float addrspace(1)* %out, align 4 @@ -30,9 +30,9 @@ ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 + %a = load volatile float, float addrspace(1)* %aptr, align 4 + %b = load volatile float, float addrspace(1)* %bptr, align 4 + %c = load volatile float, float addrspace(1)* %cptr, align 4 %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone store float %f1, float addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/global-extload-i1.ll =================================================================== --- test/CodeGen/AMDGPU/global-extload-i1.ll +++ test/CodeGen/AMDGPU/global-extload-i1.ll @@ -153,8 +153,8 @@ ; } ; FUNC-LABEL: {{^}}zextload_global_i1_to_i64: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], -; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} +; SI-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]], +; SI-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; SI: buffer_store_dwordx2 define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %a = load i1, i1 addrspace(1)* %in Index: test/CodeGen/AMDGPU/global-extload-i16.ll =================================================================== --- test/CodeGen/AMDGPU/global-extload-i16.ll +++ test/CodeGen/AMDGPU/global-extload-i16.ll @@ -154,8 +154,8 @@ } ; FUNC-LABEL: {{^}}zextload_global_i16_to_i64: -; SI: buffer_load_ushort v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]], +; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in Index: test/CodeGen/AMDGPU/global-extload-i32.ll =================================================================== --- test/CodeGen/AMDGPU/global-extload-i32.ll +++ test/CodeGen/AMDGPU/global-extload-i32.ll @@ -3,8 +3,8 @@ ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}zextload_global_i32_to_i64: -; SI: buffer_load_dword v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI-DAG: buffer_load_dword v[[LO:[0-9]+]], +; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %a = load i32, i32 addrspace(1)* %in Index: test/CodeGen/AMDGPU/global-extload-i8.ll =================================================================== --- test/CodeGen/AMDGPU/global-extload-i8.ll +++ test/CodeGen/AMDGPU/global-extload-i8.ll @@ -151,8 +151,8 @@ ; } ; FUNC-LABEL: {{^}}zextload_global_i8_to_i64: -; SI: buffer_load_ubyte v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI-DAG: buffer_load_ubyte v[[LO:[0-9]+]], +; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { %a = load i8, i8 addrspace(1)* %in Index: test/CodeGen/AMDGPU/indirect-private-64.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-private-64.ll +++ test/CodeGen/AMDGPU/indirect-private-64.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA4 -check-prefix=SI %s ; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=CI-ALLOCA16 -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=CI-PROMOTE -check-prefix=SI %s declare void @llvm.amdgcn.s.barrier() #0 @@ -18,6 +18,8 @@ ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_read_b64 +; CI-PROMOTE: ds_write_b64 +; CI-PROMOTE: ds_read_b64 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 { %val = load double, double addrspace(1)* %in, align 8 %array = alloca [16 x double], align 8 @@ -47,6 +49,8 @@ ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_read_b64 ; SI-PROMOTE: ds_read_b64 +; CI-PROMOTE: ds_write2_b64 +; CI-PROMOTE: ds_read2_b64 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 %array = alloca [8 x <2 x double>], align 16 @@ -71,6 +75,8 @@ ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_read_b64 +; CI-PROMOTE: ds_write_b64 +; CI-PROMOTE: ds_read_b64 define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 { %val = load i64, i64 addrspace(1)* %in, align 8 %array = alloca [8 x i64], align 8 @@ -101,6 +107,8 @@ ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_read_b64 ; SI-PROMOTE: ds_read_b64 +; CI-PROMOTE: ds_write2_b64 +; CI-PROMOTE: ds_read2_b64 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %array = alloca [8 x <2 x i64>], align 16 Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll @@ -8,13 +8,12 @@ ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}] ; TODO: this constant should be folded: -; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 +; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff -; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] -; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] -; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] +; VI-DAG: s_mov_b32 s[[LOW2:[0-9+]]], s[[LOW1]] +; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] +; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -25,14 +25,13 @@ ; SI: v_rsq_clamp_f64_e32 ; TODO: this constant should be folded: -; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 +; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff -; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}} -; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] -; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] +; VI-DAG: s_mov_b32 s[[LOW2:[0-9+]]], s[[LOW1]] +; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] +; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src) store double %rsq_clamp, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.memcpy.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.memcpy.ll +++ test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -153,15 +153,11 @@ ; FIXME: Use 64-bit ops ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8: -; SI: ds_read_b64 -; SI: ds_read_b64 -; SI: ds_read_b64 -; SI: ds_read_b64 - -; SI: ds_write_b64 -; SI: ds_write_b64 -; SI: ds_write_b64 -; SI: ds_write_b64 +; SI: ds_read2_b64 +; SI: ds_read2_b64 + +; SI: ds_write2_b64 +; SI: ds_write2_b64 ; SI-DAG: s_endpgm define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { Index: test/CodeGen/AMDGPU/load-i1.ll =================================================================== --- test/CodeGen/AMDGPU/load-i1.ll +++ test/CodeGen/AMDGPU/load-i1.ll @@ -85,8 +85,8 @@ } ; FUNC-LABEL: {{^}}global_zextload_i1_to_i64: -; SI: buffer_load_ubyte -; SI: v_mov_b32_e32 {{v[0-9]+}}, 0 +; SI-DAG: buffer_load_ubyte +; SI-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0 ; SI: buffer_store_dwordx2 ; SI: s_endpgm define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { Index: test/CodeGen/AMDGPU/local-64.ll =================================================================== --- test/CodeGen/AMDGPU/local-64.ll +++ test/CodeGen/AMDGPU/local-64.ll @@ -122,8 +122,7 @@ ; BOTH-LABEL: {{^}}local_v2i64_store: ; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120 +; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:15 offset1:14 ; BOTH: s_endpgm define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7 @@ -133,8 +132,7 @@ ; BOTH-LABEL: {{^}}local_v2i64_store_0_offset: ; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 +; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1 ; BOTH: s_endpgm define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { store <2 x i64> , <2 x i64> addrspace(3)* %out, align 16 @@ -143,10 +141,8 @@ ; BOTH-LABEL: {{^}}local_v4i64_store: ; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248 +; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:31 offset1:30 +; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:29 offset1:28 ; BOTH: s_endpgm define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7 @@ -156,10 +152,8 @@ ; BOTH-LABEL: {{^}}local_v4i64_store_0_offset: ; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24 +; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:3 offset1:2 +; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1 ; BOTH: s_endpgm define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { store <4 x i64> , <4 x i64> addrspace(3)* %out, align 16 Index: test/CodeGen/AMDGPU/local-memory-two-objects.ll =================================================================== --- test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -32,9 +32,7 @@ ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] ; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], vcc, 16, v{{[0-9]+}} ; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] -; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 -; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] - +; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:4 define void @local_memory_two_objects(i32 addrspace(1)* %out) { entry: %x.i = call i32 @llvm.r600.read.tidig.x() #0 Index: test/CodeGen/AMDGPU/missing-store.ll =================================================================== --- test/CodeGen/AMDGPU/missing-store.ll +++ test/CodeGen/AMDGPU/missing-store.ll @@ -7,8 +7,8 @@ ; FUNC-LABEL: {{^}}missing_store_reduced: ; SI: ds_read_b64 -; SI: buffer_store_dword -; SI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} +; SI-DAG: buffer_store_dword +; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} ; SI: s_load_dword ; SI: s_nop 2 Index: test/CodeGen/AMDGPU/reorder-stores.ll =================================================================== --- test/CodeGen/AMDGPU/reorder-stores.ll +++ test/CodeGen/AMDGPU/reorder-stores.ll @@ -16,10 +16,8 @@ } ; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store: -; SI: ds_read_b64 -; SI: ds_read_b64 -; SI: ds_write_b64 -; SI: ds_write_b64 +; SI: ds_read2_b64 +; SI: ds_write2_b64 ; SI: s_endpgm define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 Index: test/CodeGen/AMDGPU/sdiv.ll =================================================================== --- test/CodeGen/AMDGPU/sdiv.ll +++ test/CodeGen/AMDGPU/sdiv.ll @@ -34,8 +34,8 @@ ; working. ; FUNC-LABEL: {{^}}slow_sdiv_i32_3435: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b ; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]] ; SI: v_add_i32 ; SI: v_lshrrev_b32 Index: test/CodeGen/AMDGPU/shift-i64-opts.ll =================================================================== --- test/CodeGen/AMDGPU/shift-i64-opts.ll +++ test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -40,8 +40,8 @@ } ; GCN-LABEL: {{^}}lshr_i64_32: -; GCN: buffer_load_dword v[[LO:[0-9]+]] -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]] +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in @@ -81,8 +81,8 @@ } ; GCN-LABEL: {{^}}shl_i64_const_32: -; GCN: buffer_load_dword v[[HI:[0-9]+]] -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]] +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -10,8 +10,7 @@ @stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8 ; FUNC-LABEL: @reorder_local_load_global_store_local_load -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 +; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3 ; CI: buffer_store_dword define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 @@ -71,8 +70,8 @@ } ; FUNC-LABEL: @reorder_constant_load_global_store_constant_load -; CI: buffer_store_dword -; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} +; CI-DAG: buffer_store_dword +; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 @@ -156,8 +155,7 @@ } ; FUNC-LABEL: @reorder_local_offsets -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408 +; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408 ; CI: buffer_store_dword Index: test/CodeGen/AMDGPU/store.ll =================================================================== --- test/CodeGen/AMDGPU/store.ll +++ test/CodeGen/AMDGPU/store.ll @@ -287,8 +287,7 @@ ; CM: LDS_WRITE ; CM: LDS_WRITE -; SI: ds_write_b64 -; SI: ds_write_b64 +; SI: ds_write2_b64 define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll =================================================================== --- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -42,13 +42,13 @@ } ; GCN-LABEL: {{^}}test_use_s_v_s: +; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} + ; GCN: buffer_load_dword [[VA0:v[0-9]+]] ; GCN-NOT: v_mov_b32 ; GCN: buffer_load_dword [[VA1:v[0-9]+]] -; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} - ; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; GCN-NOT: v_mov_b32 Index: test/CodeGen/AMDGPU/xor.ll =================================================================== --- test/CodeGen/AMDGPU/xor.ll +++ test/CodeGen/AMDGPU/xor.ll @@ -64,8 +64,8 @@ ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] ; SI: buffer_store_byte [[RESULT]] define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { - %a = load i1, i1 addrspace(1)* %in0 - %b = load i1, i1 addrspace(1)* %in1 + %a = load volatile i1, i1 addrspace(1)* %in0 + %b = load volatile i1, i1 addrspace(1)* %in1 %xor = xor i1 %a, %b store i1 %xor, i1 addrspace(1)* %out ret void