Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -156,6 +156,10 @@
 // register spills than just using one of these approaches on its own.
 Policy.OnlyTopDown = false;
 Policy.OnlyBottomUp = false;
+
+ // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
+ if (!enableSIScheduler())
+   Policy.ShouldTrackLaneMasks = true;
 }
 }
Index: llvm/trunk/test/CodeGen/AMDGPU/and.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/and.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/and.ll
@@ -282,11 +282,11 @@
 ; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
 ; SI-NOT: and
 ; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]]
-; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
 ; SI-NOT: and
-; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
+; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
 ; SI-NOT: and
-; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
 define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
 %a = load volatile i64, i64 addrspace(1)* %aptr
 %b = load volatile i64, i64 addrspace(1)* %aptr
Index: llvm/trunk/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -3,11 +3,11 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
-; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
 ; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
 ; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
@@ -21,12 +21,12 @@
 }
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
-; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
 ; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
+; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
 ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
Index: llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -2,12 +2,10 @@
 ; GCN-LABEL: {{^}}stored_fi_to_lds:
 ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
-
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
-
 ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
 define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
 %tmp = alloca float
@@ -19,7 +17,6 @@
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
 ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
@@ -27,6 +24,7 @@
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[FI1]]
+; GCN-DAG: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
Index: llvm/trunk/test/CodeGen/AMDGPU/commute_modifiers.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/commute_modifiers.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -7,7 +7,7 @@
 ; FUNC-LABEL: @commute_add_imm_fabs_f32
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
 define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -21,7 +21,7 @@
 ; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
 define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -36,7 +36,7 @@
 ; FUNC-LABEL: @commute_mul_imm_fneg_f32
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
 define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -52,7 +52,7 @@
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
 define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -67,7 +67,7 @@
 ; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
 define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -84,7 +84,7 @@
 ; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
 define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -101,7 +101,7 @@
 ; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
 define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -120,7 +120,7 @@
 ; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
 define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -138,7 +138,7 @@
 ; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
 define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
Index: llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll
@@ -116,7 +116,7 @@
 ; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
 ; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
 ; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
-; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]]
 ; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
 ; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
 ; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
Index: llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -149,7 +149,7 @@
 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
 ; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI-NEXT: buffer_store_dword [[RESULT]],
+; SI: buffer_store_dword [[RESULT]],
 define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
 %val = load i32, i32 addrspace(1)* %valptr
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -162,7 +162,7 @@
 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
 ; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI-NEXT: buffer_store_dword [[RESULT]],
+; SI: buffer_store_dword [[RESULT]],
 define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
 %val = load i32, i32 addrspace(1)* %valptr
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
Index: llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll
@@ -116,9 +116,10 @@
 ; FUNC-LABEL: {{^}}ctpop_i64_in_br:
 ; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
 ; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
-; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; GCN-DAG: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
-; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
 ; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
 ; GCN: s_endpgm
 define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
Index: llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -33,8 +33,8 @@
 ; SI-NOT: bfe
 ; SI-NOT: v_cvt_f32_ubyte3_e32
 ; SI-DAG: v_cvt_f32_ubyte2_e32
-; SI-DAG: v_cvt_f32_ubyte1_e32
-; SI-DAG: v_cvt_f32_ubyte0_e32
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
 %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
Index: llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -85,14 +85,8 @@
 }
 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8:
-; CI-DAG: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-
-; FIXME: These moves shouldn't be necessary, it should be able to
-; store the same register if offset1 was the non-zero offset.
-
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: buffer_store_dwordx4
+; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI: buffer_store_dwordx4 [[REG_ZW]]
 ; CI: s_endpgm
 define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -104,10 +98,8 @@
 }
 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg:
-; CI: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: buffer_store_dwordx4
+; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI: buffer_store_dwordx4 [[REG_ZW]]
 ; CI: s_endpgm
 define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -120,14 +112,10 @@
 ; FIXME: Extra moves shuffling superregister
 ; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
+; CI-DAG: ds_read2_b64 [[VEC_HI:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-DAG: ds_read2_b64 [[VEC_LO:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
+; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
 ; CI: s_endpgm
 define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -140,22 +128,15 @@
 ; FIXME: Extra moves shuffling superregister
 ; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:7{{$}}
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-
+; CI-DAG: ds_read2_b64 [[VEC0_3:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read2_b64 [[VEC4_7:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-DAG: ds_read2_b64 [[VEC8_11:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:4 offset1:5{{$}}
+; CI-DAG: ds_read2_b64 [[VEC12_15:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:6 offset1:7{{$}}
 ; CI: s_waitcnt lgkmcnt(0)
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
+; CI-DAG: buffer_store_dwordx4 [[VEC0_3]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
+; CI-DAG: buffer_store_dwordx4 [[VEC4_7]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
+; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
+; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
 ; CI: s_endpgm
 define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Index: llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -65,9 +65,9 @@
 ; SI-LABEL: @simple_read2st64_f32_over_max_offset
 ; SI-NOT: ds_read2st64_b32
-; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
-; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
+; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
+; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
+; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
 ; SI: s_endpgm
 define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Index: llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
@@ -179,8 +179,8 @@
 }
 ; SI-LABEL: @simple_write2_two_val_f32_x2
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
-; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -209,8 +209,8 @@
 }
 ; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
-; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Index: llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -13,8 +13,8 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
 ; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
-; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
-; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
+; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
+; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
 ; GCN: s_endpgm
 define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
Index: llvm/trunk/test/CodeGen/AMDGPU/ffloor.f64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ffloor.f64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ffloor.f64.ll
@@ -13,8 +13,8 @@
 ; FUNC-LABEL: {{^}}ffloor_f64:
 ; CI: v_floor_f64_e32
 ; SI: v_fract_f64_e32
-; SI: v_min_f64
-; SI: v_cmp_class_f64_e64
+; SI-DAG: v_min_f64
+; SI-DAG: v_cmp_class_f64_e64
 ; SI: v_cndmask_b32_e64
 ; SI: v_cndmask_b32_e64
 ; SI: v_add_f64
@@ -28,8 +28,8 @@
 ; FUNC-LABEL: {{^}}ffloor_f64_neg:
 ; CI: v_floor_f64_e64
 ; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]]
-; SI: v_min_f64
-; SI: v_cmp_class_f64_e64
+; SI-DAG: v_min_f64
+; SI-DAG: v_cmp_class_f64_e64
 ; SI: v_cndmask_b32_e64
 ; SI: v_cndmask_b32_e64
 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
@@ -44,8 +44,8 @@
 ; FUNC-LABEL: {{^}}ffloor_f64_neg_abs:
 ; CI: v_floor_f64_e64
 ; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]|
-; SI: v_min_f64
-; SI: v_cmp_class_f64_e64
+; SI-DAG: v_min_f64
+; SI-DAG: v_cmp_class_f64_e64
 ; SI: v_cndmask_b32_e64
 ; SI: v_cndmask_b32_e64
 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
Index: llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -55,8 +55,8 @@
 }
 ; FUNC-LABEL: {{^}}fneg_fabs_f64:
-; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
 ; SI: s_load_dwordx2
+; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
 ; SI: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
 ; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
 ; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
@@ -10,10 +10,11 @@
 ; TODO: this constant should be folded:
 ; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
 ; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
-; VI: s_mov_b32 s[[LOW:[0-9+]]], s[[ALLBITS]]
-; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW]]:[[HIGH1]]]
+; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
+; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
 ; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
-; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW]]:[[HIGH2]]]
+; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
+; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
 define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
 %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -68,7 +68,6 @@
 ; create copies which we don't bother to track here.
 ;
 ;CHECK-LABEL: {{^}}test3:
-;CHECK-DAG: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 glc
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
@@ -79,6 +78,7 @@
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc
 ;CHECK-DAG: s_waitcnt vmcnt(0)
+;CHECK-DAG: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[SOFS]] offset:1 glc
 define float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) #0 {
 main_body:
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -187,7 +187,7 @@
 ; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
@@ -202,7 +202,7 @@
 ; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
@@ -218,7 +218,7 @@
 ; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
@@ -234,7 +234,7 @@
 ; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -28,10 +28,11 @@
 ; TODO: this constant should be folded:
 ; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
 ; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
-; VI: s_mov_b32 s[[LOW:[0-9+]]], s[[ALLBITS]]
-; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW]]:[[HIGH1]]]
+; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
+; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
 ; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
-; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW]]:[[HIGH2]]]
+; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
+; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
 define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
 %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
 store double %rsq_clamp, double addrspace(1)* %out
Index: llvm/trunk/test/CodeGen/AMDGPU/local-atomics.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/local-atomics.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/local-atomics.ll
@@ -5,8 +5,8 @@
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
 ; EG: LDS_WRXCHG_RET *
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
 ; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: buffer_store_dword [[RESULT]],
@@ -31,8 +31,8 @@
 ; XXX - Is it really necessary to load 4 into VGPR?
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
 ; EG: LDS_ADD_RET *
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
 ; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: buffer_store_dword [[RESULT]],
Index: llvm/trunk/test/CodeGen/AMDGPU/local-atomics64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/local-atomics64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -30,10 +30,10 @@
 }
 ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
 ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
 ; GCN: buffer_store_dwordx2 [[RESULT]],
Index: llvm/trunk/test/CodeGen/AMDGPU/max.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/max.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/max.ll
@@ -191,8 +191,8 @@
 }
 ; FUNC-LABEL: {{^}}s_test_umax_ugt_imm_v2i32:
-; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
-; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
+; SI-DAG: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
+; SI-DAG: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
 define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
 %cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
 %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
@@ -205,8 +205,8 @@
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
-; SI-NEXT: buffer_store_dword [[VMAX]]
+; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
+; SI: buffer_store_dword [[VMAX]]
 define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
 %a.ext = zext i16 %a to i32
 %b.ext = zext i16 %b to i32
@@ -223,8 +223,8 @@
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
-; SI-NEXT: buffer_store_dword [[VMAX]]
+; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
+; SI: buffer_store_dword [[VMAX]]
 define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
 %a.ext = sext i16 %a to i32
 %b.ext = sext i16 %b to i32
Index: llvm/trunk/test/CodeGen/AMDGPU/min.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/min.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/min.ll
@@ -301,8 +301,8 @@
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
-; SI-NEXT: buffer_store_dword [[VMIN]]
+; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI: buffer_store_dword [[VMIN]]
 define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
 %a.ext = zext i16 %a to i32
 %b.ext = zext i16 %b to i32
@@ -319,8 +319,8 @@
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
-; SI-NEXT: buffer_store_dword [[VMIN]]
+; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI: buffer_store_dword [[VMIN]]
 define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
 %a.ext = sext i16 %a to i32
 %b.ext = sext i16 %b to i32
Index: llvm/trunk/test/CodeGen/AMDGPU/mubuf.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/mubuf.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/mubuf.ll
@@ -8,7 +8,7 @@
 ; MUBUF load with an immediate byte offset that fits into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_load0:
-; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
+; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
 define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
 %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1
@@ -19,7 +19,7 @@
 ; MUBUF load with the largest possible immediate offset
 ; CHECK-LABEL: {{^}}mubuf_load1:
-; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
+; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
 define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
 %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095
Index: llvm/trunk/test/CodeGen/AMDGPU/setcc-opt.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/setcc-opt.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -216,7 +216,7 @@
 ; GCN: buffer_load_sbyte [[B:v[0-9]+]]
 ; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}}
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
 define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
 %b = load i8, i8 addrspace(1)* %b.ptr
Index: llvm/trunk/test/CodeGen/AMDGPU/setcc64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/setcc64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/setcc64.ll
@@ -59,7 +59,7 @@
 ; FUNC-LABEL: {{^}}f64_one:
 ; SI: v_cmp_lg_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
 %0 = fcmp one double %a, %b
@@ -80,7 +80,7 @@
 ; FUNC-LABEL: {{^}}f64_ueq:
 ; SI: v_cmp_nlg_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
 %0 = fcmp ueq double %a, %b
@@ -92,7 +92,7 @@
 ; FUNC-LABEL: {{^}}f64_ugt:
 ; SI: v_cmp_nle_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
 %0 = fcmp ugt double %a, %b
@@ -103,7 +103,7 @@
 ; FUNC-LABEL: {{^}}f64_uge:
 ; SI: v_cmp_nlt_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
 %0 = fcmp uge double %a, %b
@@ -114,7 +114,7 @@
 ; FUNC-LABEL: {{^}}f64_ult:
 ; SI: v_cmp_nge_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
 %0 = fcmp ult double %a, %b
@@ -125,7 +125,7 @@
 ; FUNC-LABEL: {{^}}f64_ule:
 ; SI: v_cmp_ngt_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
 %0 = fcmp ule double %a, %b
Index: llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll
@@ -1,4 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=SI --misched=si < %s | FileCheck %s
+; FIXME: The si scheduler crashes when lane mask tracking is enabled, so
+; we need to disable this when the si scheduler is being used.
+; The only way the subtarget knows that the si machine scheduler is being used
+; is to specify -mattr=si-scheduler. If we just pass --misched=si, the backend
+; won't know what scheduler we are using.
+; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s | FileCheck %s
 ; The test checks that the "si" machine scheduler pass works correctly.
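For readers following along, the policy hook touched at the top of this patch ends up shaped roughly like the sketch below. This is a paraphrase of the AMDGPUSubtarget.cpp hunk; the function signature and anything not visible in the hunk are assumptions, not the verbatim tree:

void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                          unsigned NumRegionInstrs) const {
  // Scheduling in both directions gives fewer register spills than
  // either top-down or bottom-up alone (per the comment in the hunk).
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Lane-mask tracking gives the generic MachineScheduler accurate
  // subregister liveness, but it currently crashes the SI Machine
  // Scheduler, so only enable it when that scheduler is not selected.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

This is also why the RUN line fix above adds -mattr=si-scheduler: the subtarget predicate enableSIScheduler() is a feature query, so passing only --misched=si would leave it false and lane-mask tracking would still be turned on.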
Index: llvm/trunk/test/CodeGen/AMDGPU/sra.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sra.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/sra.ll
@@ -230,8 +230,8 @@
 ; GCN-LABEL: {{^}}s_ashr_63_i64:
 ; GCN-DAG: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
 ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
-; GCN: s_mov_b32 s[[COPYSHIFT:[0-9]+]], s[[SHIFT]]
-; GCN: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}}
+; GCN-DAG: s_mov_b32 s[[COPYSHIFT:[0-9]+]], s[[SHIFT]]
+; GCN-DAG: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}}
 ; GCN: s_addc_u32 {{s[0-9]+}}, s[[COPYSHIFT]], {{s[0-9]+}}
 define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 %result = ashr i64 %a, 63
Index: llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -8,14 +8,12 @@
 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, 0, v1
+; CHECK: v_cmp_eq_i32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
 ; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
 ; BB0_1:
 ; CHECK: s_load_dword s6, s[0:1], 0xa
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
 ; BB0_2:
 ; CHECK: s_or_b64 exec, exec, s[2:3]
 ; CHECK-NEXT: s_mov_b32 s7, 0xf000
Index: llvm/trunk/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -22,7 +22,7 @@
 ; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
 define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
 %load = load i1, i1 addrspace(1)* %in
 %ext = zext i1 %load to i32
@@ -45,7 +45,7 @@
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1:
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
 define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
 %load = load i1, i1 addrspace(1)* %in
 %ext = zext i1 %load to i32
@@ -57,7 +57,7 @@
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1:
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
 define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
 %load = load i1, i1 addrspace(1)* %in
 %ext = sext i1 %load to i32
@@ -81,7 +81,7 @@
 ; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_ne_0:
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
 define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
 %load = load i1, i1 addrspace(1)* %in
 %ext = sext i1 %load to i32
@@ -93,7 +93,7 @@
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0:
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
 define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
 %load = load i1, i1 addrspace(1)* %in
 %ext = zext i1 %load to i32
@@ -119,7 +119,7 @@
 ; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
 define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
 %load = load i1, i1 addrspace(1)* %in
 %ext = zext i1 %load to i32
@@ -158,7 +158,7 @@
 ; SI: buffer_load_sbyte [[LOAD:v[0-9]+]]
 ; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}}
 ; SI-NEXT: v_cndmask_b32_e64
-; SI-NEXT: buffer_store_byte
+; SI: buffer_store_byte
 define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
 %load = load i8, i8 addrspace(1)* %in
 %masked = and i8 %load, 255
Index: llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -28,10 +28,10 @@
 }
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
 ; GCN: buffer_store_dword [[RESULT]]
@@ -42,13 +42,13 @@
 }
 ; GCN-LABEL: {{^}}test_use_s_v_s:
-; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
-
 ; GCN: buffer_load_dword [[VA0:v[0-9]+]]
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_load_dword [[VA1:v[0-9]+]]
+; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+
 ; GCN-NOT: v_mov_b32
 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; GCN-NOT: v_mov_b32
@@ -68,10 +68,10 @@
 }
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
@@ -82,10 +82,10 @@
 }
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
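A note on the test churn: with lane-mask tracking enabled the scheduler is free to reorder independent instructions, so most of the updates above relax ordered CHECK/CHECK-NEXT lines into CHECK-DAG groups. A minimal illustration of the FileCheck semantics being relied on (hypothetical instructions, not taken from any test in this patch):

; The two loads are independent and may be emitted in either order, so a
; block of -DAG directives accepts both schedules:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]],
; SI-DAG: buffer_load_dword [[B:v[0-9]+]],
; The use must follow both loads, so a plain check line anchors it after
; the -DAG group:
; SI: v_add_f32_e32 {{v[0-9]+}}, [[A]], [[B]]

By contrast, SI-NEXT pins a match to the very next line, which is exactly the kind of ordering constraint these tests can no longer make once the scheduler may interleave unrelated instructions.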