Index: test/CodeGen/AMDGPU/extract-lowbits.ll
===================================================================
--- test/CodeGen/AMDGPU/extract-lowbits.ll
+++ test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=AMDGPU %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=AMDGPU %s
 
 ; Loosely based on test/CodeGen/{X86,AArch64}/extract-lowbits.ll,
 ; but with all 64-bit tests, and tests with loads dropped.
@@ -16,174 +16,224 @@
 ; Pattern a. 32-bit
 ; ---------------------------------------------------------------------------- ;
 
-define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_a0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_a0:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 [[BITS:v[0-9]+]], s[[NUM]]
+; GCN-NEXT: v_bfe_u32 [[RES:v[0-9]*]], s[[VAL]], 0, [[BITS]]
+; SI-NEXT: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_a0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %onebit = shl i32 1, %numlowbits
   %mask = add nsw i32 %onebit, -1
   %masked = and i32 %mask, %val
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }
 
-define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_a1_indexzext:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_a1_indexzext:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: s_and_b32 [[ZEXT:s[0-9]+]], s[[NUM]]
+; GCN: v_mov_b32_e32 [[BITS:v[0-9]+]], [[ZEXT]]
+; GCN-NEXT: v_bfe_u32 [[RES:v[0-9]*]], s[[VAL]], 0, [[BITS]]
+; SI-NEXT: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
   %conv = zext i8 %numlowbits to i32
   %onebit = shl i32 1, %conv
   %mask = add nsw i32 %onebit, -1
   %masked = and i32 %mask, %val
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }
 
-define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_a4_commutative:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_a4_commutative:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 [[BITS:v[0-9]+]], s[[NUM]]
+; GCN-NEXT: v_bfe_u32 [[RES:v[0-9]*]], s[[VAL]], 0, [[BITS]]
+; SI-NEXT: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_a4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %onebit = shl i32 1, %numlowbits
   %mask = add nsw i32 %onebit, -1
   %masked = and i32 %val, %mask ; swapped order
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }
 
 ; ---------------------------------------------------------------------------- ;
 ; Pattern b. 32-bit
 ; ---------------------------------------------------------------------------- ;
 
-define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_b0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_b0:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 [[BITS:v[0-9]+]], s[[NUM]]
+; GCN-NEXT: v_bfe_u32 [[RES:v[0-9]*]], s[[VAL]], 0, [[BITS]]
+; SI-NEXT: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_b0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %notmask = shl i32 -1, %numlowbits
   %mask = xor i32 %notmask, -1
   %masked = and i32 %mask, %val
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }
 
-define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_b1_indexzext:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_b1_indexzext:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: s_and_b32 [[ZEXT:s[0-9]+]], s[[NUM]]
+; GCN: v_mov_b32_e32 [[BITS:v[0-9]+]], [[ZEXT]]
+; GCN-NEXT: v_bfe_u32 [[RES:v[0-9]*]], s[[VAL]], 0, [[BITS]]
+; SI-NEXT: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
   %conv = zext i8 %numlowbits to i32
   %notmask = shl i32 -1, %conv
   %mask = xor i32 %notmask, -1
   %masked = and i32 %mask, %val
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }
 
-define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_b4_commutative:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_b4_commutative:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 [[BITS:v[0-9]+]], s[[NUM]]
+; GCN-NEXT: v_bfe_u32 [[RES:v[0-9]*]], s[[VAL]], 0, [[BITS]]
+; SI-NEXT: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_b4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %notmask = shl i32 -1, %numlowbits
   %mask = xor i32 %notmask, -1
   %masked = and i32 %val, %mask ; swapped order
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }
 
 ; ---------------------------------------------------------------------------- ;
 ; Pattern c. 32-bit
 ; ---------------------------------------------------------------------------- ;
 
-define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_c0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_c0:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 [[BITS:v[0-9]+]], s[[NUM]]
+; GCN-NEXT: v_bfe_u32 [[RES:v[0-9]*]], s[[VAL]], 0, [[BITS]]
+; SI-NEXT: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_c0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
   %masked = and i32 %mask, %val
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }
 
-define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
-; SI-LABEL: bzhi32_c1_indexzext:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT: v_and_b32_e32 v0, v1, v0
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bzhi32_c1_indexzext:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u16_e32 v1, 32, v1
-; VI-NEXT: v_mov_b32_e32 v2, -1
-; VI-NEXT: v_lshrrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_and_b32_e32 v0, v1, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_c1_indexzext:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[NUM]]
+; GCN-NEXT: s_and_b32 [[ZEXT:s[0-9]+]], [[SUB]], 0xff
+; GCN-NEXT: s_lshr_b32 [[MASK:s[0-9]+]], -1, [[ZEXT]]
+; GCN-NEXT: s_and_b32 [[SRES:s[0-9]+]], [[MASK]], s[[VAL]]
+; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], [[SRES]]
+; SI: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) {
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
   %mask = lshr i32 -1, %sh_prom
   %masked = and i32 %mask, %val
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }
 
-define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_c4_commutative:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_c4_commutative:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 [[BITS:v[0-9]+]], s[[NUM]]
+; GCN-NEXT: v_bfe_u32 [[RES:v[0-9]*]], s[[VAL]], 0, [[BITS]]
+; SI-NEXT: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_c4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
   %masked = and i32 %val, %mask ; swapped order
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }
 
 ; ---------------------------------------------------------------------------- ;
 ; Pattern d. 32-bit.
 ; ---------------------------------------------------------------------------- ;
 
-define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_d0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_d0:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 [[BITS:v[0-9]+]], s[[NUM]]
+; GCN-NEXT: v_bfe_u32 [[RES:v[0-9]*]], s[[VAL]], 0, [[BITS]]
+; SI-NEXT: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_d0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %numhighbits = sub i32 32, %numlowbits
   %highbitscleared = shl i32 %val, %numhighbits
   %masked = lshr i32 %highbitscleared, %numhighbits
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }
 
-define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
-; SI-LABEL: bzhi32_d1_indexzext:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_lshl_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshr_b32_e32 v0, v0, v1
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bzhi32_d1_indexzext:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u16_e32 v1, 32, v1
-; VI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
+; AMDGPU-LABEL: bzhi32_d1_indexzext:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[VAL:[0-9]+]]:[[NUM:[0-9]+]]{{\]}}, s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; GCN: s_waitcnt
+; GCN-NEXT: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[NUM]]
+; GCN-NEXT: s_and_b32 [[ZEXT:s[0-9]+]], [[SUB]], 0xff
+; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], s[[VAL]], [[ZEXT]]
+; GCN-NEXT: s_lshr_b32 [[SHR:s[0-9]+]], [[SHL]], [[ZEXT]]
+; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], [[SHR]]
+; SI: buffer_store_dword [[RES]]
+; VI: flat_store_dword {{.*}}, [[RES]]
+define amdgpu_kernel void @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) {
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
   %highbitscleared = shl i32 %val, %sh_prom
   %masked = lshr i32 %highbitscleared, %sh_prom
-  ret i32 %masked
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
 }