Changeset View
Changeset View
Standalone View
Standalone View
llvm/test/CodeGen/AMDGPU/sdiv.ll
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||||
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn | FileCheck %s -check-prefixes=FUNC,SI,GCN | ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx600 | FileCheck %s -check-prefixes=FUNC,SI,GCN | ||||
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s -check-prefixes=FUNC,SI,TONGA | ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s -check-prefixes=FUNC,SI,TONGA | ||||
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global | FileCheck %s -check-prefixes=FUNC,SI,GFX9 | ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global | FileCheck %s -check-prefixes=FUNC,SI,GFX9 | ||||
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood | FileCheck %s -check-prefixes=FUNC,EG | ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood | FileCheck %s -check-prefixes=FUNC,EG | ||||
; The code generated by sdiv is long and complex and may frequently change. | ; The code generated by sdiv is long and complex and may frequently change. | ||||
; The goal of this test is to make sure the ISel doesn't fail. | ; The goal of this test is to make sure the ISel doesn't fail. | ||||
; | ; | ||||
; This program was previously failing to compile when one of the selectcc | ; This program was previously failing to compile when one of the selectcc | ||||
; opcodes generated by the sdiv lowering was being legalized and optimized to: | ; opcodes generated by the sdiv lowering was being legalized and optimized to: | ||||
; selectcc Remainder -1, 0, -1, SETGT | ; selectcc Remainder -1, 0, -1, SETGT | ||||
; This was fixed by adding an additional pattern in R600Instructions.td to | ; This was fixed by adding an additional pattern in R600Instructions.td to | ||||
; match this pattern with a CNDGE_INT. | ; match this pattern with a CNDGE_INT. | ||||
define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { | define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { | ||||
; GCN-LABEL: sdiv_i32: | ; GCN-LABEL: sdiv_i32: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s7, 0xf000 | ; GCN-NEXT: s_mov_b32 s7, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s6, -1 | ; GCN-NEXT: s_mov_b32 s6, -1 | ||||
; GCN-NEXT: s_mov_b32 s10, s6 | ; GCN-NEXT: s_mov_b32 s2, s6 | ||||
; GCN-NEXT: s_mov_b32 s11, s7 | ; GCN-NEXT: s_mov_b32 s3, s7 | ||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s8, s2 | ; GCN-NEXT: s_mov_b32 s0, s10 | ||||
; GCN-NEXT: s_mov_b32 s9, s3 | ; GCN-NEXT: s_mov_b32 s1, s11 | ||||
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 | ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 | ||||
; GCN-NEXT: s_mov_b32 s4, s0 | ; GCN-NEXT: s_mov_b32 s4, s8 | ||||
; GCN-NEXT: s_mov_b32 s5, s1 | ; GCN-NEXT: s_mov_b32 s5, s9 | ||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0 | ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 | ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 | ||||
; GCN-NEXT: v_xor_b32_e32 v4, v2, v3 | ; GCN-NEXT: v_xor_b32_e32 v1, v1, v2 | ||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 | ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 | ||||
; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 | ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v0 | ||||
; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 | ; GCN-NEXT: v_add_i32_e32 v0, vcc, v6, v0 | ||||
; GCN-NEXT: v_xor_b32_e32 v1, v1, v3 | ; GCN-NEXT: v_xor_b32_e32 v0, v0, v6 | ||||
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 | ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 | ||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 | ; GCN-NEXT: v_xor_b32_e32 v2, v6, v2 | ||||
; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 | ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3 | ||||
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 | ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 | ||||
; GCN-NEXT: v_mul_hi_u32 v3, v2, v1 | ; GCN-NEXT: v_mul_lo_u32 v4, v3, v1 | ||||
; GCN-NEXT: v_mul_lo_u32 v5, v2, v1 | ; GCN-NEXT: v_mul_hi_u32 v5, v3, v1 | ||||
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v5 | ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 | ||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 | ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] | ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] | ||||
; GCN-NEXT: v_mul_hi_u32 v3, v3, v2 | ; GCN-NEXT: v_mul_hi_u32 v4, v4, v3 | ||||
; GCN-NEXT: v_add_i32_e32 v5, vcc, v3, v2 | ; GCN-NEXT: v_add_i32_e32 v5, vcc, v4, v3 | ||||
; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2 | ; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v4, v3 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] | ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] | ||||
; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 | ; GCN-NEXT: v_mul_hi_u32 v3, v3, v0 | ||||
; GCN-NEXT: v_mul_lo_u32 v3, v2, v1 | ; GCN-NEXT: v_mul_lo_u32 v4, v3, v1 | ||||
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 | ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 | ||||
; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v2 | ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v3 | ||||
; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v0 | ; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v4, v0 | ||||
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 | ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 | ||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1 | ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1 | ||||
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc | ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc | ||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v5, s[0:1] | ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] | ||||
; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc | ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc | ||||
; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 | ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 | ||||
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 | ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 | ||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 | ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 | ||||
; GCN-NEXT: s_endpgm | ; GCN-NEXT: s_endpgm | ||||
; | ; | ||||
; TONGA-LABEL: sdiv_i32: | ; TONGA-LABEL: sdiv_i32: | ||||
; TONGA: ; %bb.0: | ; TONGA: ; %bb.0: | ||||
; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 | ; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 | ||||
; TONGA-NEXT: s_mov_b32 s7, 0xf000 | ; TONGA-NEXT: s_mov_b32 s7, 0xf000 | ||||
; TONGA-NEXT: s_mov_b32 s6, -1 | ; TONGA-NEXT: s_mov_b32 s6, -1 | ||||
▲ Show 20 Lines • Show All 140 Lines • ▼ Show 20 Lines | ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) | ||||
%result = sdiv i32 %num, %den | %result = sdiv i32 %num, %den | ||||
store i32 %result, i32 addrspace(1)* %out | store i32 %result, i32 addrspace(1)* %out | ||||
ret void | ret void | ||||
} | } | ||||
define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { | define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { | ||||
; GCN-LABEL: sdiv_i32_4: | ; GCN-LABEL: sdiv_i32_4: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s7, 0xf000 | ; GCN-NEXT: s_mov_b32 s3, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s6, -1 | ; GCN-NEXT: s_mov_b32 s2, -1 | ||||
; GCN-NEXT: s_mov_b32 s10, s6 | |||||
; GCN-NEXT: s_mov_b32 s11, s7 | |||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s8, s2 | ; GCN-NEXT: s_mov_b32 s0, s4 | ||||
; GCN-NEXT: s_mov_b32 s9, s3 | ; GCN-NEXT: s_mov_b32 s1, s5 | ||||
; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 | ; GCN-NEXT: s_mov_b32 s4, s6 | ||||
; GCN-NEXT: s_mov_b32 s4, s0 | ; GCN-NEXT: s_mov_b32 s5, s7 | ||||
; GCN-NEXT: s_mov_b32 s5, s1 | ; GCN-NEXT: s_mov_b32 s6, s2 | ||||
; GCN-NEXT: s_mov_b32 s7, s3 | |||||
; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 | |||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 | ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 | ||||
; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1 | ; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1 | ||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 | ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 | ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 | ||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 | ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||
; GCN-NEXT: s_endpgm | ; GCN-NEXT: s_endpgm | ||||
; | ; | ||||
; TONGA-LABEL: sdiv_i32_4: | ; TONGA-LABEL: sdiv_i32_4: | ||||
; TONGA: ; %bb.0: | ; TONGA: ; %bb.0: | ||||
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||
; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ||||
; TONGA-NEXT: s_mov_b32 s2, -1 | ; TONGA-NEXT: s_mov_b32 s2, -1 | ||||
; TONGA-NEXT: s_waitcnt lgkmcnt(0) | ; TONGA-NEXT: s_waitcnt lgkmcnt(0) | ||||
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines | |||||
} | } | ||||
; Multiply by a weird constant to make sure setIntDivIsCheap is | ; Multiply by a weird constant to make sure setIntDivIsCheap is | ||||
; working. | ; working. | ||||
define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { | define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { | ||||
; GCN-LABEL: slow_sdiv_i32_3435: | ; GCN-LABEL: slow_sdiv_i32_3435: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s7, 0xf000 | ; GCN-NEXT: s_mov_b32 s3, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s6, -1 | ; GCN-NEXT: s_mov_b32 s2, -1 | ||||
; GCN-NEXT: s_mov_b32 s10, s6 | ; GCN-NEXT: s_mov_b32 s10, s2 | ||||
; GCN-NEXT: s_mov_b32 s11, s7 | ; GCN-NEXT: s_mov_b32 s11, s3 | ||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s8, s2 | ; GCN-NEXT: s_mov_b32 s8, s6 | ||||
; GCN-NEXT: s_mov_b32 s9, s3 | ; GCN-NEXT: s_mov_b32 s9, s7 | ||||
; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 | ; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 | ||||
; GCN-NEXT: s_mov_b32 s2, 0x98a1930b | ; GCN-NEXT: s_mov_b32 s0, 0x98a1930b | ||||
; GCN-NEXT: s_mov_b32 s4, s0 | ; GCN-NEXT: s_mov_b32 s1, s5 | ||||
; GCN-NEXT: s_mov_b32 s5, s1 | |||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_mul_hi_i32 v1, v0, s2 | ; GCN-NEXT: v_mul_hi_i32 v1, v0, s0 | ||||
; GCN-NEXT: s_mov_b32 s0, s4 | |||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 | ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 | ||||
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 | ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 | ; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 | ||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 | ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 | ||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 | ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||
; GCN-NEXT: s_endpgm | ; GCN-NEXT: s_endpgm | ||||
; | ; | ||||
; TONGA-LABEL: slow_sdiv_i32_3435: | ; TONGA-LABEL: slow_sdiv_i32_3435: | ||||
; TONGA: ; %bb.0: | ; TONGA: ; %bb.0: | ||||
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||
; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ||||
; TONGA-NEXT: s_mov_b32 s2, -1 | ; TONGA-NEXT: s_mov_b32 s2, -1 | ||||
; TONGA-NEXT: s_mov_b32 s10, s2 | ; TONGA-NEXT: s_mov_b32 s10, s2 | ||||
▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines | |||||
} | } | ||||
define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { | define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { | ||||
; GCN-LABEL: sdiv_v2i32: | ; GCN-LABEL: sdiv_v2i32: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s11, 0xf000 | ; GCN-NEXT: s_mov_b32 s11, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s10, -1 | ; GCN-NEXT: s_mov_b32 s10, -1 | ||||
; GCN-NEXT: s_mov_b32 s6, s10 | ; GCN-NEXT: s_mov_b32 s4, 0x4f800000 | ||||
; GCN-NEXT: s_mov_b32 s7, s11 | |||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s4, s2 | |||||
; GCN-NEXT: s_mov_b32 s5, s3 | |||||
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 | |||||
; GCN-NEXT: s_mov_b32 s2, 0x4f800000 | |||||
; GCN-NEXT: s_mov_b32 s8, s0 | ; GCN-NEXT: s_mov_b32 s8, s0 | ||||
; GCN-NEXT: s_mov_b32 s9, s1 | ; GCN-NEXT: s_mov_b32 s9, s1 | ||||
; GCN-NEXT: s_mov_b32 s0, s2 | |||||
; GCN-NEXT: s_mov_b32 s1, s3 | |||||
; GCN-NEXT: s_mov_b32 s2, s10 | |||||
; GCN-NEXT: s_mov_b32 s3, s11 | |||||
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 | |||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 | |||||
; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2 | ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1 | |||||
; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 | ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 | ||||
; GCN-NEXT: v_xor_b32_e32 v8, v4, v5 | |||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 | |||||
; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 | ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 | ||||
; GCN-NEXT: v_xor_b32_e32 v9, v6, v7 | ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 | ||||
; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1 | |||||
; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 | ; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 | ||||
; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 | |||||
; GCN-NEXT: v_xor_b32_e32 v2, v2, v5 | ; GCN-NEXT: v_xor_b32_e32 v2, v2, v5 | ||||
; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 | ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1 | ||||
; GCN-NEXT: v_xor_b32_e32 v8, v4, v5 | |||||
; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 | |||||
; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 | ; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 | ||||
; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 | ; GCN-NEXT: v_xor_b32_e32 v9, v6, v7 | ||||
; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 | ; GCN-NEXT: v_cvt_f32_u32_e32 v7, v3 | ||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 | |||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 | ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 | ||||
; GCN-NEXT: v_mul_f32_e32 v4, s2, v4 | ; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 | ||||
; GCN-NEXT: v_mul_f32_e32 v5, s2, v5 | ; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 | ||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v7 | |||||
; GCN-NEXT: v_mul_f32_e32 v4, s4, v5 | |||||
; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 | ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 | ||||
; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1 | |||||
; GCN-NEXT: v_mul_f32_e32 v5, s4, v7 | |||||
; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 | ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 | ||||
; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 | |||||
; GCN-NEXT: v_mul_hi_u32 v6, v4, v2 | ; GCN-NEXT: v_mul_hi_u32 v6, v4, v2 | ||||
; GCN-NEXT: v_mul_lo_u32 v7, v4, v2 | ; GCN-NEXT: v_mul_lo_u32 v7, v4, v2 | ||||
; GCN-NEXT: v_mul_hi_u32 v10, v5, v3 | ; GCN-NEXT: v_mul_hi_u32 v10, v5, v3 | ||||
; GCN-NEXT: v_mul_lo_u32 v11, v5, v3 | ; GCN-NEXT: v_mul_lo_u32 v11, v5, v3 | ||||
; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v7 | |||||
; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v11 | |||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 | ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 | ||||
; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v7 | |||||
; GCN-NEXT: v_cndmask_b32_e64 v6, v7, v12, s[0:1] | ; GCN-NEXT: v_cndmask_b32_e64 v6, v7, v12, s[0:1] | ||||
; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v11 | |||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v10 | ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v10 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v7, v11, v13, s[2:3] | ; GCN-NEXT: v_cndmask_b32_e64 v7, v11, v13, s[2:3] | ||||
; GCN-NEXT: v_mul_hi_u32 v6, v6, v4 | ; GCN-NEXT: v_mul_hi_u32 v6, v6, v4 | ||||
; GCN-NEXT: v_mul_hi_u32 v7, v7, v5 | ; GCN-NEXT: v_mul_hi_u32 v7, v7, v5 | ||||
; GCN-NEXT: v_add_i32_e32 v10, vcc, v6, v4 | ; GCN-NEXT: v_add_i32_e32 v10, vcc, v6, v4 | ||||
; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v6, v4 | ; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v6, v4 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] | |||||
; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v5 | ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v5 | ||||
; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v7, v5 | ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v7, v5 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] | |||||
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[2:3] | |||||
; GCN-NEXT: v_mul_hi_u32 v4, v4, v0 | ; GCN-NEXT: v_mul_hi_u32 v4, v4, v0 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[2:3] | |||||
; GCN-NEXT: v_mul_hi_u32 v5, v5, v1 | ; GCN-NEXT: v_mul_hi_u32 v5, v5, v1 | ||||
; GCN-NEXT: v_mul_lo_u32 v6, v4, v2 | ; GCN-NEXT: v_mul_lo_u32 v6, v4, v2 | ||||
; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 | ; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 | ||||
; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v4 | |||||
; GCN-NEXT: v_mul_lo_u32 v11, v5, v3 | ; GCN-NEXT: v_mul_lo_u32 v11, v5, v3 | ||||
; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v5 | ; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v4 | ||||
; GCN-NEXT: v_add_i32_e32 v13, vcc, -1, v5 | |||||
; GCN-NEXT: v_subrev_i32_e32 v14, vcc, v6, v0 | ; GCN-NEXT: v_subrev_i32_e32 v14, vcc, v6, v0 | ||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v6 | ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v6 | ||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v2 | |||||
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v11, v1 | ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v11, v1 | ||||
; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v5 | |||||
; GCN-NEXT: v_add_i32_e32 v13, vcc, -1, v5 | |||||
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11 | ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11 | ||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v2 | |||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 | ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 | ||||
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1] | ; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1] | ||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[2:3] | ; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[2:3] | ||||
; GCN-NEXT: s_and_b64 s[2:3], s[4:5], vcc | ; GCN-NEXT: s_and_b64 s[2:3], s[4:5], vcc | ||||
; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v12, s[2:3] | ; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v12, s[2:3] | ||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[0:1] | ; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[0:1] | ||||
; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc | ; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc | ||||
; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 | ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 | ||||
▲ Show 20 Lines • Show All 247 Lines • ▼ Show 20 Lines | ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) | ||||
%result = sdiv <2 x i32> %num, %den | %result = sdiv <2 x i32> %num, %den | ||||
store <2 x i32> %result, <2 x i32> addrspace(1)* %out | store <2 x i32> %result, <2 x i32> addrspace(1)* %out | ||||
ret void | ret void | ||||
} | } | ||||
define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { | define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { | ||||
; GCN-LABEL: sdiv_v2i32_4: | ; GCN-LABEL: sdiv_v2i32_4: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s7, 0xf000 | ; GCN-NEXT: s_mov_b32 s3, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s6, -1 | ; GCN-NEXT: s_mov_b32 s2, -1 | ||||
; GCN-NEXT: s_mov_b32 s10, s6 | |||||
; GCN-NEXT: s_mov_b32 s11, s7 | |||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s8, s2 | ; GCN-NEXT: s_mov_b32 s0, s4 | ||||
; GCN-NEXT: s_mov_b32 s9, s3 | ; GCN-NEXT: s_mov_b32 s1, s5 | ||||
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 | ; GCN-NEXT: s_mov_b32 s4, s6 | ||||
; GCN-NEXT: s_mov_b32 s4, s0 | ; GCN-NEXT: s_mov_b32 s5, s7 | ||||
; GCN-NEXT: s_mov_b32 s5, s1 | ; GCN-NEXT: s_mov_b32 s6, s2 | ||||
; GCN-NEXT: s_mov_b32 s7, s3 | |||||
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 | |||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0 | ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 | ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 | ||||
; GCN-NEXT: v_lshrrev_b32_e32 v2, 30, v2 | ; GCN-NEXT: v_lshrrev_b32_e32 v2, 30, v2 | ||||
; GCN-NEXT: v_lshrrev_b32_e32 v3, 30, v3 | ; GCN-NEXT: v_lshrrev_b32_e32 v3, 30, v3 | ||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 | ; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 | ||||
; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 | ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 | ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 | ; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 | ||||
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 | ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 | ||||
; GCN-NEXT: s_endpgm | ; GCN-NEXT: s_endpgm | ||||
; | ; | ||||
; TONGA-LABEL: sdiv_v2i32_4: | ; TONGA-LABEL: sdiv_v2i32_4: | ||||
; TONGA: ; %bb.0: | ; TONGA: ; %bb.0: | ||||
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||
; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ||||
; TONGA-NEXT: s_mov_b32 s2, -1 | ; TONGA-NEXT: s_mov_b32 s2, -1 | ||||
; TONGA-NEXT: s_waitcnt lgkmcnt(0) | ; TONGA-NEXT: s_waitcnt lgkmcnt(0) | ||||
▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines | ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) | ||||
%result = sdiv <2 x i32> %num, <i32 4, i32 4> | %result = sdiv <2 x i32> %num, <i32 4, i32 4> | ||||
store <2 x i32> %result, <2 x i32> addrspace(1)* %out | store <2 x i32> %result, <2 x i32> addrspace(1)* %out | ||||
ret void | ret void | ||||
} | } | ||||
define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { | define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { | ||||
; GCN-LABEL: sdiv_v4i32: | ; GCN-LABEL: sdiv_v4i32: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s19, 0xf000 | ; GCN-NEXT: s_mov_b32 s11, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s18, -1 | ; GCN-NEXT: s_mov_b32 s10, -1 | ||||
; GCN-NEXT: s_mov_b32 s2, s18 | ; GCN-NEXT: s_mov_b32 s2, s10 | ||||
; GCN-NEXT: s_mov_b32 s3, s19 | ; GCN-NEXT: s_mov_b32 s3, s11 | ||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s0, s10 | ; GCN-NEXT: s_mov_b32 s0, s14 | ||||
; GCN-NEXT: s_mov_b32 s1, s11 | ; GCN-NEXT: s_mov_b32 s1, s15 | ||||
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 | ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 | ||||
; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 | ; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 | ||||
; GCN-NEXT: s_mov_b32 s6, 0x4f800000 | ; GCN-NEXT: s_mov_b32 s14, 0x4f800000 | ||||
; GCN-NEXT: s_mov_b32 s8, s12 | |||||
; GCN-NEXT: s_mov_b32 s9, s13 | |||||
; GCN-NEXT: s_waitcnt vmcnt(1) | ; GCN-NEXT: s_waitcnt vmcnt(1) | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0 | ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0 | ||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 | ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 | |||||
; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 | |||||
; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 | |||||
; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 | |||||
; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v3 | |||||
; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v7 | |||||
; GCN-NEXT: v_xor_b32_e32 v16, v8, v9 | |||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0 | |||||
; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 | |||||
; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 | |||||
; GCN-NEXT: v_add_i32_e32 v3, vcc, v14, v3 | |||||
; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 | ; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 | ||||
; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 | |||||
; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 | |||||
; GCN-NEXT: v_add_i32_e32 v7, vcc, v15, v7 | |||||
; GCN-NEXT: v_xor_b32_e32 v17, v10, v11 | |||||
; GCN-NEXT: v_xor_b32_e32 v18, v12, v13 | |||||
; GCN-NEXT: v_xor_b32_e32 v19, v14, v15 | |||||
; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 | |||||
; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 | ; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 | ||||
; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 | ; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 | ||||
; GCN-NEXT: v_cvt_f32_u32_e32 v9, v4 | |||||
; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 | |||||
; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 | |||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0 | |||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v9 | |||||
; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 | ; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 | ||||
; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 | ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 | ||||
; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 | ; GCN-NEXT: v_cvt_f32_u32_e32 v8, v5 | ||||
; GCN-NEXT: v_xor_b32_e32 v3, v3, v14 | ; GCN-NEXT: v_mul_f32_e32 v9, s14, v9 | ||||
; GCN-NEXT: v_xor_b32_e32 v7, v7, v15 | ; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 | ||||
; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4 | ; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 | ||||
; GCN-NEXT: v_cvt_f32_u32_e32 v9, v5 | |||||
; GCN-NEXT: v_cvt_f32_u32_e32 v10, v6 | |||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 | ; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 | ||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v9 | ; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 | ||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 | ; GCN-NEXT: v_xor_b32_e32 v16, v10, v11 | ||||
; GCN-NEXT: v_mul_f32_e32 v8, s6, v8 | ; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 | ||||
; GCN-NEXT: v_mul_f32_e32 v9, s6, v9 | ; GCN-NEXT: v_mul_f32_e32 v8, s14, v8 | ||||
; GCN-NEXT: v_mul_f32_e32 v10, s6, v10 | ; GCN-NEXT: v_mul_hi_u32 v11, v9, v4 | ||||
; GCN-NEXT: v_mul_lo_u32 v10, v9, v4 | |||||
; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 | ; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 | ||||
; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 | ; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 | ||||
; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 | ; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 | ||||
; GCN-NEXT: v_mul_hi_u32 v11, v8, v4 | ; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 | ||||
; GCN-NEXT: v_mul_lo_u32 v12, v8, v4 | |||||
; GCN-NEXT: v_mul_hi_u32 v13, v9, v5 | |||||
; GCN-NEXT: v_mul_lo_u32 v14, v9, v5 | |||||
; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v12 | |||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 | ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 | ||||
; GCN-NEXT: v_mul_hi_u32 v11, v10, v6 | ; GCN-NEXT: v_xor_b32_e32 v17, v12, v13 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[0:1] | ; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 | ||||
; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v14 | ; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v10 | ||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v13 | ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[0:1] | ||||
; GCN-NEXT: v_mul_lo_u32 v13, v10, v6 | ; GCN-NEXT: v_mul_hi_u32 v12, v8, v5 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[2:3] | ; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 | ||||
; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v13 | ; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 | ||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 | ; GCN-NEXT: v_mul_lo_u32 v11, v8, v5 | ||||
; GCN-NEXT: v_cvt_f32_u32_e32 v11, v7 | ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v12 | ||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 | ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v6 | ||||
; GCN-NEXT: v_mul_f32_e32 v11, s6, v11 | ; GCN-NEXT: v_mul_hi_u32 v10, v10, v9 | ||||
; GCN-NEXT: v_cvt_u32_f32_e32 v11, v11 | ; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v11 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] | ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v13, s[2:3] | ||||
; GCN-NEXT: v_mul_hi_u32 v15, v11, v7 | ; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 | ||||
; GCN-NEXT: v_mul_lo_u32 v20, v11, v7 | ; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 | ||||
; GCN-NEXT: v_sub_i32_e32 v21, vcc, 0, v20 | ; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7 | ||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 | ; GCN-NEXT: v_xor_b32_e32 v7, v7, v14 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v15, v20, v21, s[6:7] | ; GCN-NEXT: v_mul_f32_e32 v12, s14, v12 | ||||
; GCN-NEXT: v_mul_hi_u32 v12, v12, v8 | ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 | ||||
; GCN-NEXT: v_add_i32_e32 v20, vcc, v12, v8 | ; GCN-NEXT: v_mul_hi_u32 v18, v12, v6 | ||||
; GCN-NEXT: v_subrev_i32_e32 v8, vcc, v12, v8 | ; GCN-NEXT: v_mul_lo_u32 v13, v12, v6 | ||||
; GCN-NEXT: v_mul_hi_u32 v12, v14, v9 | ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 | ||||
; GCN-NEXT: v_add_i32_e32 v14, vcc, v12, v9 | ; GCN-NEXT: v_add_i32_e32 v18, vcc, v10, v9 | ||||
; GCN-NEXT: v_subrev_i32_e32 v9, vcc, v12, v9 | ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, v10, v9 | ||||
; GCN-NEXT: v_mul_hi_u32 v12, v13, v10 | ; GCN-NEXT: v_mul_hi_u32 v10, v11, v8 | ||||
; GCN-NEXT: v_add_i32_e32 v13, vcc, v12, v10 | ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v18, s[0:1] | ||||
; GCN-NEXT: v_subrev_i32_e32 v10, vcc, v12, v10 | ; GCN-NEXT: v_mul_hi_u32 v9, v9, v0 | ||||
; GCN-NEXT: v_mul_hi_u32 v12, v15, v11 | ; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v13 | ||||
; GCN-NEXT: v_add_i32_e32 v15, vcc, v12, v11 | ; GCN-NEXT: v_add_i32_e32 v11, vcc, v10, v8 | ||||
; GCN-NEXT: v_subrev_i32_e32 v11, vcc, v12, v11 | ; GCN-NEXT: v_subrev_i32_e32 v8, vcc, v10, v8 | ||||
; GCN-NEXT: s_mov_b32 s16, s8 | ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v19, s[4:5] | ||||
; GCN-NEXT: s_mov_b32 s17, s9 | ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[2:3] | ||||
; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v20, s[0:1] | ; GCN-NEXT: v_mul_hi_u32 v10, v13, v12 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[2:3] | ; GCN-NEXT: v_mul_lo_u32 v11, v9, v4 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[4:5] | ; GCN-NEXT: v_mul_hi_u32 v8, v8, v1 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[6:7] | ; GCN-NEXT: v_add_i32_e32 v13, vcc, v10, v12 | ||||
; GCN-NEXT: v_mul_hi_u32 v8, v8, v0 | ; GCN-NEXT: v_subrev_i32_e32 v10, vcc, v10, v12 | ||||
; GCN-NEXT: v_mul_hi_u32 v9, v9, v1 | ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v11 | ||||
; GCN-NEXT: v_mul_hi_u32 v10, v10, v2 | ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 | ||||
; GCN-NEXT: v_mul_hi_u32 v11, v11, v3 | |||||
; GCN-NEXT: v_mul_lo_u32 v12, v8, v4 | |||||
; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v8 | |||||
; GCN-NEXT: v_add_i32_e32 v14, vcc, -1, v8 | |||||
; GCN-NEXT: v_mul_lo_u32 v15, v9, v5 | |||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v12 | |||||
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 | |||||
; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v9 | |||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 | ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 | ||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, -1, v9 | ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[4:5] | ||||
; GCN-NEXT: v_mul_lo_u32 v4, v10, v6 | ; GCN-NEXT: v_mul_lo_u32 v0, v8, v5 | ||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v15 | ; GCN-NEXT: v_mul_hi_u32 v4, v10, v2 | ||||
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v15 | ; GCN-NEXT: v_add_i32_e32 v12, vcc, -1, v9 | ||||
; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v10 | ; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v8 | ||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 | ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v0 | ||||
; GCN-NEXT: v_add_i32_e32 v1, vcc, -1, v10 | ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 | ||||
; GCN-NEXT: v_mul_lo_u32 v5, v11, v7 | ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v5 | ||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4 | ; GCN-NEXT: v_mul_lo_u32 v5, v4, v6 | ||||
; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 | ; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v9 | ||||
; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v11 | ; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v8 | ||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[10:11], v3, v5 | ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] | ||||
; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 | ; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc | ||||
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v11 | ; GCN-NEXT: v_sub_i32_e32 v9, vcc, v2, v5 | ||||
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 | ; GCN-NEXT: s_and_b64 vcc, s[6:7], s[4:5] | ||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[12:13], v3, v7 | ; GCN-NEXT: v_cvt_f32_u32_e32 v11, v7 | ||||
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1] | ; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc | ||||
; GCN-NEXT: v_cndmask_b32_e64 v2, v8, v13, s[2:3] | ; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[0:1] | ||||
; GCN-NEXT: s_and_b64 s[2:3], s[6:7], s[4:5] | ; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[4:5] | ||||
; GCN-NEXT: v_cndmask_b32_e64 v3, v9, v12, s[2:3] | ; GCN-NEXT: v_xor_b32_e32 v1, v1, v15 | ||||
; GCN-NEXT: s_and_b64 vcc, vcc, s[8:9] | ; GCN-NEXT: v_xor_b32_e32 v8, v0, v16 | ||||
; GCN-NEXT: v_cndmask_b32_e32 v6, v10, v15, vcc | ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v15 | ||||
; GCN-NEXT: s_and_b64 vcc, s[12:13], s[10:11] | ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v8, v16 | ||||
; GCN-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc | ; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v11 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v2, v14, v2, s[0:1] | ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v9, v6 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] | ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v2, v5 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] | ; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v3 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[10:11] | ; GCN-NEXT: v_mul_f32_e32 v8, s14, v8 | ||||
; GCN-NEXT: v_xor_b32_e32 v2, v2, v16 | ; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 | ||||
; GCN-NEXT: v_xor_b32_e32 v4, v0, v17 | ; GCN-NEXT: v_add_i32_e32 v3, vcc, v10, v3 | ||||
; GCN-NEXT: v_xor_b32_e32 v5, v1, v18 | ; GCN-NEXT: v_xor_b32_e32 v3, v3, v10 | ||||
; GCN-NEXT: v_xor_b32_e32 v3, v3, v19 | ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v4 | ||||
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v16 | ; GCN-NEXT: v_mul_lo_u32 v5, v8, v7 | ||||
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v4, v17 | ; GCN-NEXT: v_mul_hi_u32 v9, v8, v7 | ||||
; GCN-NEXT: v_sub_i32_e32 v2, vcc, v5, v18 | ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v4 | ||||
; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v19 | ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 0, v5 | ||||
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 | ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] | |||||
; GCN-NEXT: v_mul_hi_u32 v5, v5, v8 | |||||
; GCN-NEXT: v_add_i32_e32 v9, vcc, v5, v8 | |||||
; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v5, v8 | |||||
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] | |||||
; GCN-NEXT: v_mul_hi_u32 v5, v5, v3 | |||||
; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] | |||||
; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc | |||||
; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] | |||||
; GCN-NEXT: v_mul_lo_u32 v4, v5, v7 | |||||
; GCN-NEXT: v_xor_b32_e32 v2, v2, v17 | |||||
; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 | |||||
; GCN-NEXT: v_xor_b32_e32 v6, v10, v14 | |||||
; GCN-NEXT: v_sub_i32_e32 v8, vcc, v3, v4 | |||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v8, v7 | |||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v3, v4 | |||||
; GCN-NEXT: v_add_i32_e32 v7, vcc, -1, v5 | |||||
; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v5 | |||||
; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] | |||||
; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc | |||||
; GCN-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] | |||||
; GCN-NEXT: v_xor_b32_e32 v3, v3, v6 | |||||
; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 | |||||
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 | |||||
; GCN-NEXT: s_endpgm | ; GCN-NEXT: s_endpgm | ||||
; | ; | ||||
; TONGA-LABEL: sdiv_v4i32: | ; TONGA-LABEL: sdiv_v4i32: | ||||
; TONGA: ; %bb.0: | ; TONGA: ; %bb.0: | ||||
; TONGA-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 | ; TONGA-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 | ||||
; TONGA-NEXT: s_mov_b32 s11, 0xf000 | ; TONGA-NEXT: s_mov_b32 s11, 0xf000 | ||||
; TONGA-NEXT: s_mov_b32 s10, -1 | ; TONGA-NEXT: s_mov_b32 s10, -1 | ||||
; TONGA-NEXT: s_mov_b32 s2, s10 | ; TONGA-NEXT: s_mov_b32 s2, s10 | ||||
▲ Show 20 Lines • Show All 429 Lines • ▼ Show 20 Lines | ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) | ||||
%result = sdiv <4 x i32> %num, %den | %result = sdiv <4 x i32> %num, %den | ||||
store <4 x i32> %result, <4 x i32> addrspace(1)* %out | store <4 x i32> %result, <4 x i32> addrspace(1)* %out | ||||
ret void | ret void | ||||
} | } | ||||
define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { | define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { | ||||
; GCN-LABEL: sdiv_v4i32_4: | ; GCN-LABEL: sdiv_v4i32_4: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s7, 0xf000 | ; GCN-NEXT: s_mov_b32 s3, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s6, -1 | ; GCN-NEXT: s_mov_b32 s2, -1 | ||||
; GCN-NEXT: s_mov_b32 s10, s6 | |||||
; GCN-NEXT: s_mov_b32 s11, s7 | |||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s8, s2 | ; GCN-NEXT: s_mov_b32 s0, s4 | ||||
; GCN-NEXT: s_mov_b32 s9, s3 | ; GCN-NEXT: s_mov_b32 s1, s5 | ||||
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 | ; GCN-NEXT: s_mov_b32 s4, s6 | ||||
; GCN-NEXT: s_mov_b32 s4, s0 | ; GCN-NEXT: s_mov_b32 s5, s7 | ||||
; GCN-NEXT: s_mov_b32 s5, s1 | ; GCN-NEXT: s_mov_b32 s6, s2 | ||||
; GCN-NEXT: s_mov_b32 s7, s3 | |||||
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 | |||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 | ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 | ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v2 | ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v2 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 | ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 | ||||
; GCN-NEXT: v_lshrrev_b32_e32 v4, 30, v4 | ; GCN-NEXT: v_lshrrev_b32_e32 v4, 30, v4 | ||||
; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5 | ; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5 | ||||
; GCN-NEXT: v_lshrrev_b32_e32 v6, 30, v6 | ; GCN-NEXT: v_lshrrev_b32_e32 v6, 30, v6 | ||||
; GCN-NEXT: v_lshrrev_b32_e32 v7, 30, v7 | ; GCN-NEXT: v_lshrrev_b32_e32 v7, 30, v7 | ||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 | ; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 | ||||
; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1 | ; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1 | ||||
; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 | ; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 | ||||
; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 | ; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 | ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 | ; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2 | ; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v3, 2, v3 | ; GCN-NEXT: v_ashrrev_i32_e32 v3, 2, v3 | ||||
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 | ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 | ||||
; GCN-NEXT: s_endpgm | ; GCN-NEXT: s_endpgm | ||||
; | ; | ||||
; TONGA-LABEL: sdiv_v4i32_4: | ; TONGA-LABEL: sdiv_v4i32_4: | ||||
; TONGA: ; %bb.0: | ; TONGA: ; %bb.0: | ||||
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||
; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ||||
; TONGA-NEXT: s_mov_b32 s2, -1 | ; TONGA-NEXT: s_mov_b32 s2, -1 | ||||
; TONGA-NEXT: s_waitcnt lgkmcnt(0) | ; TONGA-NEXT: s_waitcnt lgkmcnt(0) | ||||
▲ Show 20 Lines • Show All 99 Lines • ▼ Show 20 Lines | ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) | ||||
%result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> | %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> | ||||
store <4 x i32> %result, <4 x i32> addrspace(1)* %out | store <4 x i32> %result, <4 x i32> addrspace(1)* %out | ||||
ret void | ret void | ||||
} | } | ||||
define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { | define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { | ||||
; GCN-LABEL: v_sdiv_i8: | ; GCN-LABEL: v_sdiv_i8: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s7, 0xf000 | ; GCN-NEXT: s_mov_b32 s3, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s6, -1 | ; GCN-NEXT: s_mov_b32 s2, -1 | ||||
; GCN-NEXT: s_mov_b32 s10, s6 | ; GCN-NEXT: s_mov_b32 s10, s2 | ||||
; GCN-NEXT: s_mov_b32 s11, s7 | ; GCN-NEXT: s_mov_b32 s11, s3 | ||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s8, s2 | ; GCN-NEXT: s_mov_b32 s8, s6 | ||||
; GCN-NEXT: s_mov_b32 s9, s3 | ; GCN-NEXT: s_mov_b32 s9, s7 | ||||
; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 | ; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 | ||||
; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 | ; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 | ||||
; GCN-NEXT: s_mov_b32 s4, s0 | ; GCN-NEXT: s_mov_b32 s0, s4 | ||||
; GCN-NEXT: s_mov_b32 s5, s1 | ; GCN-NEXT: s_mov_b32 s1, s5 | ||||
; GCN-NEXT: s_waitcnt vmcnt(1) | |||||
; GCN-NEXT: v_cvt_f32_i32_e32 v3, v0 | |||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_xor_b32_e32 v2, v0, v1 | ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v1 | ||||
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 | ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 | ||||
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 | ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2 | ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 | ||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 | ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 | ||||
; GCN-NEXT: v_or_b32_e32 v2, 1, v2 | ; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 | ||||
; GCN-NEXT: v_mul_f32_e32 v3, v0, v3 | ; GCN-NEXT: v_trunc_f32_e32 v1, v1 | ||||
; GCN-NEXT: v_trunc_f32_e32 v3, v3 | ; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 | ||||
; GCN-NEXT: v_mad_f32 v0, -v3, v1, v0 | ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 | ||||
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 | ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| | ||||
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| | ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc | ||||
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc | ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 | ||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 | |||||
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 | ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 | ||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 | ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||
; GCN-NEXT: s_endpgm | ; GCN-NEXT: s_endpgm | ||||
; | ; | ||||
; TONGA-LABEL: v_sdiv_i8: | ; TONGA-LABEL: v_sdiv_i8: | ||||
; TONGA: ; %bb.0: | ; TONGA: ; %bb.0: | ||||
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||
; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ||||
; TONGA-NEXT: s_mov_b32 s2, -1 | ; TONGA-NEXT: s_mov_b32 s2, -1 | ||||
; TONGA-NEXT: s_mov_b32 s10, s2 | ; TONGA-NEXT: s_mov_b32 s10, s2 | ||||
▲ Show 20 Lines • Show All 100 Lines • ▼ Show 20 Lines | ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) | ||||
%result.ext = sext i8 %result to i32 | %result.ext = sext i8 %result to i32 | ||||
store i32 %result.ext, i32 addrspace(1)* %out | store i32 %result.ext, i32 addrspace(1)* %out | ||||
ret void | ret void | ||||
} | } | ||||
define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { | define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { | ||||
; GCN-LABEL: v_sdiv_i23: | ; GCN-LABEL: v_sdiv_i23: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s7, 0xf000 | ; GCN-NEXT: s_mov_b32 s3, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s6, -1 | ; GCN-NEXT: s_mov_b32 s2, -1 | ||||
; GCN-NEXT: s_mov_b32 s10, s6 | |||||
; GCN-NEXT: s_mov_b32 s11, s7 | |||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s8, s2 | ; GCN-NEXT: s_mov_b32 s0, s4 | ||||
; GCN-NEXT: s_mov_b32 s9, s3 | ; GCN-NEXT: s_mov_b32 s1, s5 | ||||
; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 | ; GCN-NEXT: s_mov_b32 s4, s6 | ||||
; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6 | ; GCN-NEXT: s_mov_b32 s5, s7 | ||||
; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0 | ; GCN-NEXT: s_mov_b32 s6, s2 | ||||
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 | ; GCN-NEXT: s_mov_b32 s7, s3 | ||||
; GCN-NEXT: s_mov_b32 s4, s0 | ; GCN-NEXT: buffer_load_ushort v0, off, s[4:7], 0 | ||||
; GCN-NEXT: s_mov_b32 s5, s1 | ; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2 | ||||
; GCN-NEXT: s_waitcnt vmcnt(3) | ; GCN-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 | ||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 | ; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:6 | ||||
; GCN-NEXT: s_waitcnt vmcnt(2) | ; GCN-NEXT: s_waitcnt vmcnt(2) | ||||
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 | ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 | ||||
; GCN-NEXT: s_waitcnt vmcnt(1) | |||||
; GCN-NEXT: v_or_b32_e32 v0, v0, v1 | ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 | ||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_or_b32_e32 v1, v2, v3 | ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 | ||||
; GCN-NEXT: v_or_b32_e32 v2, v2, v3 | |||||
; GCN-NEXT: v_bfe_i32 v2, v2, 0, 23 | |||||
; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 | |||||
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 | ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 | ||||
; GCN-NEXT: v_bfe_i32 v1, v1, 0, 23 | ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 | ||||
; GCN-NEXT: v_xor_b32_e32 v2, v0, v1 | ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 | ||||
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 | ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3 | ||||
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 | ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2 | ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 | ||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 | ; GCN-NEXT: v_mul_f32_e32 v2, v1, v4 | ||||
; GCN-NEXT: v_or_b32_e32 v2, 1, v2 | ; GCN-NEXT: v_trunc_f32_e32 v2, v2 | ||||
; GCN-NEXT: v_mul_f32_e32 v3, v0, v3 | ; GCN-NEXT: v_mad_f32 v1, -v2, v3, v1 | ||||
; GCN-NEXT: v_trunc_f32_e32 v3, v3 | ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 | ||||
; GCN-NEXT: v_mad_f32 v0, -v3, v1, v0 | ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| | ||||
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 | ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc | ||||
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| | ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 | ||||
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc | |||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 | |||||
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 | ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 | ||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 | ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||
; GCN-NEXT: s_endpgm | ; GCN-NEXT: s_endpgm | ||||
; | ; | ||||
; TONGA-LABEL: v_sdiv_i23: | ; TONGA-LABEL: v_sdiv_i23: | ||||
; TONGA: ; %bb.0: | ; TONGA: ; %bb.0: | ||||
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||
; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ||||
; TONGA-NEXT: s_mov_b32 s2, -1 | ; TONGA-NEXT: s_mov_b32 s2, -1 | ||||
; TONGA-NEXT: s_waitcnt lgkmcnt(0) | ; TONGA-NEXT: s_waitcnt lgkmcnt(0) | ||||
▲ Show 20 Lines • Show All 130 Lines • ▼ Show 20 Lines | ; EG-NEXT: 9(1.261169e-44), 2(2.802597e-45) | ||||
%result.ext = sext i23 %result to i32 | %result.ext = sext i23 %result to i32 | ||||
store i32 %result.ext, i32 addrspace(1)* %out | store i32 %result.ext, i32 addrspace(1)* %out | ||||
ret void | ret void | ||||
} | } | ||||
define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { | define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { | ||||
; GCN-LABEL: v_sdiv_i24: | ; GCN-LABEL: v_sdiv_i24: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s7, 0xf000 | ; GCN-NEXT: s_mov_b32 s3, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s6, -1 | ; GCN-NEXT: s_mov_b32 s2, -1 | ||||
; GCN-NEXT: s_mov_b32 s10, s6 | |||||
; GCN-NEXT: s_mov_b32 s11, s7 | |||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s8, s2 | ; GCN-NEXT: s_mov_b32 s0, s4 | ||||
; GCN-NEXT: s_mov_b32 s9, s3 | ; GCN-NEXT: s_mov_b32 s1, s5 | ||||
; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2 | ; GCN-NEXT: s_mov_b32 s4, s6 | ||||
; GCN-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6 | ; GCN-NEXT: s_mov_b32 s5, s7 | ||||
; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0 | ; GCN-NEXT: s_mov_b32 s6, s2 | ||||
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 | ; GCN-NEXT: s_mov_b32 s7, s3 | ||||
; GCN-NEXT: s_mov_b32 s4, s0 | ; GCN-NEXT: buffer_load_ushort v0, off, s[4:7], 0 | ||||
; GCN-NEXT: s_mov_b32 s5, s1 | ; GCN-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2 | ||||
; GCN-NEXT: s_waitcnt vmcnt(3) | ; GCN-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 | ||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 | ; GCN-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6 | ||||
; GCN-NEXT: s_waitcnt vmcnt(2) | ; GCN-NEXT: s_waitcnt vmcnt(2) | ||||
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 | ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 | ||||
; GCN-NEXT: s_waitcnt vmcnt(1) | |||||
; GCN-NEXT: v_or_b32_e32 v0, v0, v1 | ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 | ||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 | |||||
; GCN-NEXT: v_or_b32_e32 v2, v2, v3 | ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 | ||||
; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 | |||||
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 | |||||
; GCN-NEXT: v_xor_b32_e32 v1, v1, v3 | ; GCN-NEXT: v_xor_b32_e32 v1, v1, v3 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v1 | ; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v1 | ||||
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 | ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 | ||||
; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 | |||||
; GCN-NEXT: v_or_b32_e32 v1, 1, v1 | ; GCN-NEXT: v_or_b32_e32 v1, 1, v1 | ||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 | ; GCN-NEXT: v_mul_f32_e32 v3, v0, v4 | ||||
; GCN-NEXT: v_mul_f32_e32 v3, v0, v3 | |||||
; GCN-NEXT: v_trunc_f32_e32 v3, v3 | ; GCN-NEXT: v_trunc_f32_e32 v3, v3 | ||||
; GCN-NEXT: v_mad_f32 v0, -v3, v2, v0 | ; GCN-NEXT: v_mad_f32 v0, -v3, v2, v0 | ||||
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 | ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 | ||||
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2| | ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2| | ||||
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc | ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc | ||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 | ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 | ||||
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 | ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 | ||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 | ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||
; GCN-NEXT: s_endpgm | ; GCN-NEXT: s_endpgm | ||||
; | ; | ||||
; TONGA-LABEL: v_sdiv_i24: | ; TONGA-LABEL: v_sdiv_i24: | ||||
; TONGA: ; %bb.0: | ; TONGA: ; %bb.0: | ||||
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||
; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ; TONGA-NEXT: s_mov_b32 s3, 0xf000 | ||||
; TONGA-NEXT: s_mov_b32 s2, -1 | ; TONGA-NEXT: s_mov_b32 s2, -1 | ||||
; TONGA-NEXT: s_waitcnt lgkmcnt(0) | ; TONGA-NEXT: s_waitcnt lgkmcnt(0) | ||||
▲ Show 20 Lines • Show All 136 Lines • ▼ Show 20 Lines | ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) | ||||
%result.ext = sext i24 %result to i32 | %result.ext = sext i24 %result to i32 | ||||
store i32 %result.ext, i32 addrspace(1)* %out | store i32 %result.ext, i32 addrspace(1)* %out | ||||
ret void | ret void | ||||
} | } | ||||
define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { | define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { | ||||
; GCN-LABEL: v_sdiv_i25: | ; GCN-LABEL: v_sdiv_i25: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s7, 0xf000 | ; GCN-NEXT: s_mov_b32 s7, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s6, -1 | ; GCN-NEXT: s_mov_b32 s6, -1 | ||||
; GCN-NEXT: s_mov_b32 s10, s6 | ; GCN-NEXT: s_mov_b32 s2, s6 | ||||
; GCN-NEXT: s_mov_b32 s11, s7 | ; GCN-NEXT: s_mov_b32 s3, s7 | ||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s8, s2 | ; GCN-NEXT: s_mov_b32 s0, s10 | ||||
; GCN-NEXT: s_mov_b32 s9, s3 | ; GCN-NEXT: s_mov_b32 s1, s11 | ||||
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 | ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 | ||||
; GCN-NEXT: s_mov_b32 s4, s0 | ; GCN-NEXT: s_mov_b32 s4, s8 | ||||
; GCN-NEXT: s_mov_b32 s5, s1 | ; GCN-NEXT: s_mov_b32 s5, s9 | ||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_bfe_i32 v2, v0, 0, 25 | ; GCN-NEXT: v_bfe_i32 v2, v1, 0, 25 | ||||
; GCN-NEXT: v_bfe_i32 v3, v1, 0, 25 | |||||
; GCN-NEXT: v_bfe_i32 v0, v0, 24, 1 | |||||
; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1 | ; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1 | ||||
; GCN-NEXT: v_xor_b32_e32 v4, v0, v1 | ; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v2 | ||||
; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2 | ; GCN-NEXT: v_xor_b32_e32 v2, v2, v1 | ||||
; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v3 | ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 | ||||
; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 | ; GCN-NEXT: v_bfe_i32 v4, v0, 0, 25 | ||||
; GCN-NEXT: v_xor_b32_e32 v1, v3, v1 | ; GCN-NEXT: v_bfe_i32 v0, v0, 24, 1 | ||||
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 | ; GCN-NEXT: v_add_i32_e32 v4, vcc, v0, v4 | ||||
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 | ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 | ||||
; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 | ; GCN-NEXT: v_xor_b32_e32 v4, v4, v0 | ||||
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 | ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 | ||||
; GCN-NEXT: v_mul_hi_u32 v3, v2, v1 | ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3 | ||||
; GCN-NEXT: v_mul_lo_u32 v5, v2, v1 | ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 | ||||
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v5 | ; GCN-NEXT: v_mul_lo_u32 v5, v3, v2 | ||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 | ; GCN-NEXT: v_mul_hi_u32 v6, v3, v2 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] | ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 | ||||
; GCN-NEXT: v_mul_hi_u32 v3, v3, v2 | ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 | ||||
; GCN-NEXT: v_add_i32_e32 v5, vcc, v3, v2 | ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1] | ||||
; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2 | ; GCN-NEXT: v_mul_hi_u32 v5, v5, v3 | ||||
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] | ; GCN-NEXT: v_add_i32_e32 v6, vcc, v5, v3 | ||||
; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 | ; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v5, v3 | ||||
; GCN-NEXT: v_mul_lo_u32 v3, v2, v1 | ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] | ||||
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 | ; GCN-NEXT: v_mul_hi_u32 v3, v3, v4 | ||||
; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v2 | ; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 | ||||
; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v0 | ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 | ||||
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 | ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v3 | ||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1 | ; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v1, v4 | ||||
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 | |||||
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v2 | |||||
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc | ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc | ||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v5, s[0:1] | ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[0:1] | ||||
; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc | ; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc | ||||
; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 | ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 | ||||
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 | ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 | ||||
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 | ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 | ||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 | ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 | ||||
; GCN-NEXT: s_endpgm | ; GCN-NEXT: s_endpgm | ||||
; | ; | ||||
; TONGA-LABEL: v_sdiv_i25: | ; TONGA-LABEL: v_sdiv_i25: | ||||
; TONGA: ; %bb.0: | ; TONGA: ; %bb.0: | ||||
; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 | ; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 | ||||
; TONGA-NEXT: s_mov_b32 s7, 0xf000 | ; TONGA-NEXT: s_mov_b32 s7, 0xf000 | ||||
▲ Show 20 Lines • Show All 182 Lines • ▼ Show 20 Lines | |||||
; %result = add i64 %resultdiv, %resultrem | ; %result = add i64 %resultdiv, %resultrem | ||||
; store i64 %result, i64 addrspace(1)* %out, align 8 | ; store i64 %result, i64 addrspace(1)* %out, align 8 | ||||
; ret void | ; ret void | ||||
; } | ; } | ||||
define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { | define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { | ||||
; GCN-LABEL: scalarize_mulhs_4xi32: | ; GCN-LABEL: scalarize_mulhs_4xi32: | ||||
; GCN: ; %bb.0: | ; GCN: ; %bb.0: | ||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||
; GCN-NEXT: s_mov_b32 s7, 0xf000 | ; GCN-NEXT: s_mov_b32 s3, 0xf000 | ||||
; GCN-NEXT: s_mov_b32 s6, -1 | ; GCN-NEXT: s_mov_b32 s2, -1 | ||||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||||
; GCN-NEXT: s_mov_b32 s4, s0 | ; GCN-NEXT: s_mov_b32 s0, s4 | ||||
; GCN-NEXT: s_mov_b32 s5, s1 | ; GCN-NEXT: s_mov_b32 s1, s5 | ||||
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 | ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 | ||||
; GCN-NEXT: s_mov_b32 s0, 0x1389c755 | ; GCN-NEXT: s_mov_b32 s0, 0x1389c755 | ||||
; GCN-NEXT: s_mov_b32 s4, s2 | ; GCN-NEXT: s_mov_b32 s4, s6 | ||||
; GCN-NEXT: s_mov_b32 s5, s3 | ; GCN-NEXT: s_mov_b32 s5, s7 | ||||
; GCN-NEXT: s_mov_b32 s6, s2 | |||||
; GCN-NEXT: s_mov_b32 s7, s3 | |||||
; GCN-NEXT: s_waitcnt vmcnt(0) | ; GCN-NEXT: s_waitcnt vmcnt(0) | ||||
; GCN-NEXT: v_mul_hi_i32 v0, v0, s0 | ; GCN-NEXT: v_mul_hi_i32 v0, v0, s0 | ||||
; GCN-NEXT: v_mul_hi_i32 v1, v1, s0 | ; GCN-NEXT: v_mul_hi_i32 v1, v1, s0 | ||||
; GCN-NEXT: v_mul_hi_i32 v2, v2, s0 | ; GCN-NEXT: v_mul_hi_i32 v2, v2, s0 | ||||
; GCN-NEXT: v_mul_hi_i32 v3, v3, s0 | ; GCN-NEXT: v_mul_hi_i32 v3, v3, s0 | ||||
; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0 | ; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0 | ||||
; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0 | ; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0 | ||||
; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1 | ; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1 | ||||
▲ Show 20 Lines • Show All 124 Lines • Show Last 20 Lines |