Index: test/CodeGen/AMDGPU/insert-waitcnts-callee.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/insert-waitcnts-callee.mir @@ -0,0 +1,25 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s +--- | + define float @entry_callee_wait(float %arg) #0 { + ret float %arg + } + + attributes #0 = { nounwind } +... +--- +# CHECK-LABEL: name: entry_callee_wait{{$}} +# CHECK: bb.0: +# CHECK-NEXT: S_WAITCNT 0{{$}} +# CHECK-NEXT: V_ADD_F32 +# CHECK-NEXT: S_SETPC_B64 +liveins: + - { reg: '$sgpr0_sgpr1' } + - { reg: '$vgpr0' } + +name: entry_callee_wait +body: | + bb.0: + $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64 killed $sgpr0_sgpr1 + +... Index: test/CodeGen/AMDGPU/insert-waitcnts-exp.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/insert-waitcnts-exp.mir @@ -0,0 +1,63 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s +--- | + define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x + i32> inreg, i32 inreg %w, float %v) #0 { + %a = load volatile float, float addrspace(1)* undef + %b = load volatile float, float addrspace(1)* undef + %c = load volatile float, float addrspace(1)* undef + %d = load volatile float, float addrspace(1)* undef + call void @llvm.amdgcn.exp.f32(i32 15, i32 1, float %a, float %b, float %c, float %d, i1 true, i1 false) + ret <4 x float> <float 5.000000e-01, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00> + } + + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + + attributes #0 = { nounwind } + +... 
+--- + +# CHECK-LABEL: name: exp_done_waitcnt{{$}} +# CHECK: EXP_DONE +# CHECK-NEXT: S_WAITCNT 3855 +# CHECK: $vgpr0 = V_MOV_B32 +# CHECK: $vgpr1 = V_MOV_B32 +# CHECK: $vgpr2 = V_MOV_B32 +# CHECK: $vgpr3 = V_MOV_B32 +name: exp_done_waitcnt +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.2): + $sgpr3 = S_MOV_B32 61440 + $sgpr2 = S_MOV_B32 -1 + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + EXP_DONE 0, killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3, -1, -1, 15, implicit $exec + $vgpr0 = V_MOV_B32_e32 1056964608, implicit $exec + $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec + $vgpr2 = V_MOV_B32_e32 1073741824, implicit $exec + $vgpr3 = V_MOV_B32_e32 1082130432, implicit $exec + SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3 + +... 
Index: test/CodeGen/AMDGPU/insert-waits-callee.mir =================================================================== --- test/CodeGen/AMDGPU/insert-waits-callee.mir +++ /dev/null @@ -1,25 +0,0 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s ---- | - define float @entry_callee_wait(float %arg) #0 { - ret float %arg - } - - attributes #0 = { nounwind } -... ---- -# CHECK-LABEL: name: entry_callee_wait{{$}} -# CHECK: bb.0: -# CHECK-NEXT: S_WAITCNT 0{{$}} -# CHECK-NEXT: V_ADD_F32 -# CHECK-NEXT: S_SETPC_B64 -liveins: - - { reg: '$sgpr0_sgpr1' } - - { reg: '$vgpr0' } - -name: entry_callee_wait -body: | - bb.0: - $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec - S_SETPC_B64 killed $sgpr0_sgpr1 - -... Index: test/CodeGen/AMDGPU/insert-waits-exp.mir =================================================================== --- test/CodeGen/AMDGPU/insert-waits-exp.mir +++ /dev/null @@ -1,63 +0,0 @@ -# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s ---- | - define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x - i32> inreg, i32 inreg %w, float %v) #0 { - %a = load volatile float, float addrspace(1)* undef - %b = load volatile float, float addrspace(1)* undef - %c = load volatile float, float addrspace(1)* undef - %d = load volatile float, float addrspace(1)* undef - call void @llvm.amdgcn.exp.f32(i32 15, i32 1, float %a, float %b, float %c, float %d, i1 true, i1 false) - ret <4 x float> <float 5.000000e-01, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00> - } - - declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 - - attributes #0 = { nounwind } - -... 
---- - -# CHECK-LABEL: name: exp_done_waitcnt{{$}} -# CHECK: EXP_DONE -# CHECK-NEXT: S_WAITCNT 3855 -# CHECK: $vgpr0 = V_MOV_B32 -# CHECK: $vgpr1 = V_MOV_B32 -# CHECK: $vgpr2 = V_MOV_B32 -# CHECK: $vgpr3 = V_MOV_B32 -name: exp_done_waitcnt -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0 (%ir-block.2): - $sgpr3 = S_MOV_B32 61440 - $sgpr2 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - EXP_DONE 0, killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3, -1, -1, 15, implicit $exec - $vgpr0 = V_MOV_B32_e32 1056964608, implicit $exec - $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec - $vgpr2 = V_MOV_B32_e32 1073741824, implicit $exec - $vgpr3 = V_MOV_B32_e32 1082130432, implicit $exec - SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3 - -... 
Index: test/CodeGen/AMDGPU/scalar-store-cache-flush.mir =================================================================== --- test/CodeGen/AMDGPU/scalar-store-cache-flush.mir +++ test/CodeGen/AMDGPU/scalar-store-cache-flush.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -run-pass si-insert-waitcnts %s -o - | FileCheck %s --- | define amdgpu_kernel void @basic_insert_dcache_wb() { Index: test/CodeGen/AMDGPU/waitcnt-permute.mir =================================================================== --- test/CodeGen/AMDGPU/waitcnt-permute.mir +++ test/CodeGen/AMDGPU/waitcnt-permute.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s ... # CHECK-LABEL: name: waitcnt-permute{{$}}