Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -206,7 +206,7 @@
   [ImmArg<0>, IntrNoMem, IntrHasSideEffects]>;
 
 def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
-  Intrinsic<[], [], [IntrConvergent]>;
+  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent]>;
 
 def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
   Intrinsic<[], [], [IntrConvergent]>;
Index: llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td
@@ -1076,8 +1076,6 @@
   [(int_amdgcn_s_barrier)]> {
   let SchedRW = [WriteBarrier];
   let simm16 = 0;
-  let mayLoad = 1;
-  let mayStore = 1;
   let isConvergent = 1;
 }
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -16,9 +16,9 @@
 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0
 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0)
 ; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3
 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; VARIANT0-NEXT: s_barrier
+; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3
 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3
 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
 ; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
@@ -37,8 +37,8 @@
 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0
 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0)
 ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3
 ; VARIANT1-NEXT: s_barrier
+; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3
 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3
 ; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
 ; VARIANT1-NEXT: s_waitcnt expcnt(0)
@@ -50,46 +50,46 @@
 ; VARIANT2-LABEL: test_barrier:
 ; VARIANT2: ; %bb.0: ; %entry
 ; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VARIANT2-NEXT: v_lshlrev_b32_e32 v1, 2, v0
 ; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VARIANT2-NEXT: v_lshlrev_b32_e32 v3, 2, v0
 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0)
-; VARIANT2-NEXT: v_mov_b32_e32 v2, s3
-; VARIANT2-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
-; VARIANT2-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT2-NEXT: v_mov_b32_e32 v4, s3
+; VARIANT2-NEXT: v_xad_u32 v1, v0, -1, s0
+; VARIANT2-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
+; VARIANT2-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; VARIANT2-NEXT: global_store_dword v[3:4], v0, off
+; VARIANT2-NEXT: v_mov_b32_e32 v5, s3
+; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
+; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
 ; VARIANT2-NEXT: s_waitcnt vmcnt(0)
 ; VARIANT2-NEXT: s_barrier
-; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0
-; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
-; VARIANT2-NEXT: v_mov_b32_e32 v0, s3
-; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
-; VARIANT2-NEXT: global_load_dword v0, v[3:4], off
+; VARIANT2-NEXT: global_load_dword v0, v[0:1], off
 ; VARIANT2-NEXT: s_waitcnt vmcnt(0)
-; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT2-NEXT: global_store_dword v[3:4], v0, off
 ; VARIANT2-NEXT: s_endpgm
 ;
 ; VARIANT3-LABEL: test_barrier:
 ; VARIANT3: ; %bb.0: ; %entry
 ; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VARIANT3-NEXT: v_lshlrev_b32_e32 v1, 2, v0
 ; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VARIANT3-NEXT: v_lshlrev_b32_e32 v3, 2, v0
 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0)
-; VARIANT3-NEXT: v_mov_b32_e32 v2, s3
-; VARIANT3-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
-; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
-; VARIANT3-NEXT: s_barrier
-; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0
-; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
-; VARIANT3-NEXT: v_mov_b32_e32 v0, s3
+; VARIANT3-NEXT: v_mov_b32_e32 v4, s3
+; VARIANT3-NEXT: v_xad_u32 v1, v0, -1, s0
+; VARIANT3-NEXT: v_ashrrev_i32_e32 v2, 31, v1
 ; VARIANT3-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
-; VARIANT3-NEXT: global_load_dword v0, v[3:4], off
+; VARIANT3-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; VARIANT3-NEXT: global_store_dword v[3:4], v0, off
+; VARIANT3-NEXT: v_mov_b32_e32 v5, s3
+; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
+; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
+; VARIANT3-NEXT: s_barrier
+; VARIANT3-NEXT: global_load_dword v0, v[0:1], off
 ; VARIANT3-NEXT: s_waitcnt vmcnt(0)
-; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT3-NEXT: global_store_dword v[3:4], v0, off
 ; VARIANT3-NEXT: s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -38,18 +38,16 @@
 
 ; GCN-LABEL: {{^}}local_memory_two_objects:
 ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
+; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 ; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-
-; GCN: s_barrier
-
 ; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
 ; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+; GCN: s_barrier
+
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
-
-; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
 
 define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
Index: llvm/trunk/test/CodeGen/AMDGPU/schedule-barrier.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/schedule-barrier.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/schedule-barrier.mir
@@ -0,0 +1,46 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+# Check that the high latency loads are both scheduled first, before the
+# multiplies, despite the presence of a barrier in the function.
+# CHECK: BUFFER_LOAD_DWORD_OFFSET
+# CHECK: BUFFER_LOAD_DWORD_OFFSET
+# CHECK: V_MUL_LO_U32
+# CHECK: V_MUL_LO_U32
+name: test
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+
+    undef %43.sub3:vreg_128 = COPY $vgpr9
+    undef %42.sub2:vreg_128 = COPY $vgpr8
+    undef %41.sub1:vreg_128 = COPY $vgpr7
+    undef %26.sub0:vreg_128 = COPY $vgpr6
+    undef %46.sub3:vreg_128 = COPY $vgpr5
+    undef %45.sub2:vreg_128 = COPY $vgpr4
+    undef %44.sub1:vreg_128 = COPY $vgpr3
+    undef %32.sub0:vreg_128 = COPY $vgpr2
+    undef %38.sub1:vreg_64 = COPY $vgpr1
+    %38.sub0:vreg_64 = COPY $vgpr0
+
+    S_BARRIER
+
+    undef %33.sub0:sgpr_128 = V_READFIRSTLANE_B32 %32.sub0, implicit $exec
+    %33.sub1:sgpr_128 = V_READFIRSTLANE_B32 %44.sub1, implicit $exec
+    %33.sub2:sgpr_128 = V_READFIRSTLANE_B32 %45.sub2, implicit $exec
+    %33.sub3:sgpr_128 = V_READFIRSTLANE_B32 %46.sub3, implicit $exec
+    %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, implicit $exec
+    %39:vgpr_32 = V_MUL_LO_U32 %15, %15, implicit $exec
+
+    undef %27.sub0:sgpr_128 = V_READFIRSTLANE_B32 %26.sub0, implicit $exec
+    %27.sub1:sgpr_128 = V_READFIRSTLANE_B32 %41.sub1, implicit $exec
+    %27.sub2:sgpr_128 = V_READFIRSTLANE_B32 %42.sub2, implicit $exec
+    %27.sub3:sgpr_128 = V_READFIRSTLANE_B32 %43.sub3, implicit $exec
+    %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, implicit $exec
+    %40:vgpr_32 = V_MUL_LO_U32 %19, %19, implicit $exec
+
+    %23:vgpr_32 = V_ADD_U32_e32 %39, %40, implicit $exec
+    GLOBAL_STORE_DWORD %38, %23, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
Index: llvm/trunk/test/CodeGen/AMDGPU/v_mac_f16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -304,14 +304,17 @@
 ; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
 
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
 ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+
 ; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
 ; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
Index: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -5,8 +5,8 @@
 ; GCN-LABEL: barrier_vmcnt_global:
 ; GFX8: flat_load_dword
 ; GFX9_10: global_load_dword
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
+; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9_10: s_waitcnt vmcnt(0){{$}}
 ; GCN-NEXT: s_barrier
 define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
 bb:
@@ -28,9 +28,9 @@
 ; GCN-LABEL: barrier_vscnt_global:
 ; GFX8: flat_store_dword
 ; GFX9_10: global_store_dword
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9-NEXT: s_waitcnt vmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9: s_waitcnt vmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_barrier
 define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
 bb:
@@ -54,9 +54,9 @@
 ; GCN-LABEL: barrier_vmcnt_vscnt_global:
 ; GFX8: flat_load_dword
 ; GFX9_10: global_load_dword
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9_10: s_waitcnt vmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_barrier
 define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
 bb:
@@ -81,7 +81,7 @@
 ; GCN-LABEL: barrier_vmcnt_flat:
 ; GCN: flat_load_dword
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT: s_barrier
 define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
 bb:
@@ -102,9 +102,9 @@
 ; GCN-LABEL: barrier_vscnt_flat:
 ; GCN: flat_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_barrier
 define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
 bb:
@@ -127,8 +127,8 @@
 ; GCN-LABEL: barrier_vmcnt_vscnt_flat:
 ; GCN: flat_load_dword
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_barrier
 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
 bb:
@@ -153,8 +153,8 @@
 ; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
 ; GCN: flat_load_dword
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_barrier
 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
 bb:
@@ -218,8 +218,8 @@
 ; GCN-LABEL: store_vscnt_private:
 ; GCN: buffer_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0)
+; GFX10: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_setpc_b64
 define void @store_vscnt_private(i32 addrspace(5)* %p) {
   store i32 0, i32 addrspace(5)* %p
@@ -229,8 +229,8 @@
 ; GCN-LABEL: store_vscnt_global:
 ; GFX8: flat_store_dword
 ; GFX9_10: global_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0)
+; GFX10: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_setpc_b64
 define void @store_vscnt_global(i32 addrspace(1)* %p) {
   store i32 0, i32 addrspace(1)* %p
@@ -239,9 +239,9 @@
 ; GCN-LABEL: store_vscnt_flat:
 ; GCN: flat_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_setpc_b64
 define void @store_vscnt_flat(i32* %p) {
   store i32 0, i32* %p
@@ -249,8 +249,8 @@
 }
 
 ; GCN-LABEL: function_prologue:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_setpc_b64
 define void @function_prologue() {
   ret void
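
; For context, a minimal IR sketch of the pattern the tests above exercise;
; the kernel and value names here are illustrative, not taken from the patch.
; Each lane stores to its own slot, the workgroup synchronizes, then each
; lane reads a slot written by a different lane. With s_barrier now modeled
; as IntrNoMem + IntrHasSideEffects instead of mayLoad/mayStore, the
; scheduler may move independent ALU work across the barrier (as
; schedule-barrier.mir checks) but can never delete the barrier itself.

declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @lane_exchange(i32 addrspace(1)* %buf) {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %slot = getelementptr i32, i32 addrspace(1)* %buf, i32 %id
  ; each lane writes its own element
  store i32 %id, i32 addrspace(1)* %slot
  ; synchronize the workgroup before reading another lane's element
  call void @llvm.amdgcn.s.barrier()
  %other = xor i32 %id, 1
  %oslot = getelementptr i32, i32 addrspace(1)* %buf, i32 %other
  %val = load i32, i32 addrspace(1)* %oslot
  store i32 %val, i32 addrspace(1)* %slot
  ret void
}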