Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1,61 +1,215 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI %s -; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg: -; GCN: v_bfe_u32 define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { +; SI-LABEL: bfe_u32_arg_arg_arg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_bfe_u32 v0, v0, s3, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_arg_arg_arg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_bfe_u32 v0, v0, s1, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_arg_arg_imm: -; GCN: v_bfe_u32 define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +; SI-LABEL: bfe_u32_arg_arg_imm: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x7b +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfe_u32 v0, s2, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_arg_arg_imm: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v1, 0x7b +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_bfe_u32 v0, s0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_arg_imm_arg: -; GCN: v_bfe_u32 define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { +; SI-LABEL: bfe_u32_arg_imm_arg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x7b +; SI-NEXT: 
s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfe_u32 v0, s2, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_arg_imm_arg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 0x7b +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_bfe_u32 v0, s0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_imm_arg_arg: -; GCN: v_bfe_u32 define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { +; SI-LABEL: bfe_u32_imm_arg_arg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_movk_i32 s0, 0x7b +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfe_u32 v0, s0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_imm_arg_arg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_movk_i32 s2, 0x7b +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +; SI-LABEL: bfe_u32_arg_0_width_reg_offset: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_arg_0_width_reg_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +; SI-LABEL: bfe_u32_arg_0_width_imm_offset: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_arg_0_width_imm_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: 
v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_zextload_i8: -; GCN: buffer_load_ubyte -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_zextload_i8: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_zextload_i8: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %in %ext = zext i8 %load to i32 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8) @@ -63,14 +217,45 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8: -; GCN: buffer_load_dword -; GCN: v_add_{{[iu]}}32 -; GCN-NEXT: v_and_b32_e32 ; FIXME: Should be using s_add_i32 -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_zext_in_reg_i8: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_zext_in_reg_i8: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -79,13 +264,44 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i16: -; GCN: buffer_load_dword -; GCN: v_add_{{[iu]}}32 -; GCN-NEXT: v_and_b32_e32 -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* 
%in) #0 { +; SI-LABEL: bfe_u32_zext_in_reg_i16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_zext_in_reg_i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 65535 @@ -94,12 +310,46 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1: -; GCN: buffer_load_dword -; GCN: v_add_{{[iu]}}32 -; GCN: bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xfe, v0 +; SI-NEXT: v_bfe_u32 v0, v0, 1, 8 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xfe, v0 +; VI-NEXT: v_bfe_u32 v0, v0, 1, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -108,13 +358,46 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3: -; GCN: buffer_load_dword -; GCN: v_add_{{[iu]}}32 -; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 -; GCN-NEXT: bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; 
SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xf8, v0 +; SI-NEXT: v_bfe_u32 v0, v0, 3, 8 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xf8, v0 +; VI-NEXT: v_bfe_u32 v0, v0, 3, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -123,13 +406,46 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7: -; GCN: buffer_load_dword -; GCN: v_add_{{[iu]}}32 -; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 -; GCN-NEXT: bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x80, v0 +; SI-NEXT: v_bfe_u32 v0, v0, 7, 8 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x80, v0 +; VI-NEXT: v_bfe_u32 v0, v0, 7, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -138,12 +454,44 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8: -; GCN: buffer_load_dword -; GCN: v_add_{{[iu]}}32 -; GCN-NEXT: bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: 
s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 65535 @@ -152,11 +500,42 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_test_1: -; GCN: buffer_load_dword -; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -164,6 +543,25 @@ } define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8) @@ -172,6 +570,25 @@ } define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_3: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s3, 0xf000 +; 
SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_3: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1) @@ -179,14 +596,26 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_test_4: -; GCN-NOT: lshl -; GCN-NOT: shr -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_4: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_4: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = lshr i32 %shl, 31 @@ -195,13 +624,42 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_test_5: -; GCN: buffer_load_dword -; GCN-NOT: lshl -; GCN-NOT: shr -; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_5: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_5: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = ashr i32 %shl, 31 @@ -210,11 +668,44 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_test_6: -; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_6: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt 
lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_6: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31) @@ -222,11 +713,42 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_test_7: -; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_7: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_7: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31) @@ -234,12 +756,42 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_test_8: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_8: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_8: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; 
VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) @@ -247,299 +799,732 @@ ret void } -; GCN-LABEL: {{^}}bfe_u32_test_9: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_9: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_9: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_test_10: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_10: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_10: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 
@llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31) store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_test_11: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_11: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_11: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24) store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_test_12: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_12: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_12: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8) store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_test_13: ; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_13: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: 
s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_13: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_test_14: -; GCN-NOT: lshr -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; SI-LABEL: bfe_u32_test_14: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_test_14: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_0: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_1: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; 
SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_2: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_3: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_3: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_3: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_4: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_4: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_4: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1) store i32 
%bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_5: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_5: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_5: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_6: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_6: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x80 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_6: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0x80 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_7: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_7: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x7f +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_7: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0x7f +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_8: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_8: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: 
v_mov_b32_e32 v0, 1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_8: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_9: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_9: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_9: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_10: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_10: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_10: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_11: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_11: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 10 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_11: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 10 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4) store i32 
%bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_12: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_12: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_12: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_13: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_13: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_13: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_14: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_14: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 40 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_14: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 40 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_15: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_15: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: 
v_mov_b32_e32 v0, 10 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_15: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 10 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_16: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x7f +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0x7f +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_17: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_17: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x7f +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_17: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0x7f +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_18: -; GCN-NOT: {{[^@]}}bfe -; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; GCN: buffer_store_dword [[VREG]], -; GCN: s_endpgm ; EG-NOT: BFE define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { +; SI-LABEL: bfe_u32_constant_fold_test_18: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: bfe_u32_constant_fold_test_18: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 
31, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -549,14 +1534,48 @@ ; reduced to the bits demanded by the bfe. ; XXX: The operand to v_bfe_u32 could also just directly be the load register. -; GCN-LABEL: {{^}}simplify_bfe_u32_multi_use_arg: -; GCN: buffer_load_dword [[ARG:v[0-9]+]] -; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]] -; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2 -; GCN-DAG: buffer_store_dword [[AND]] -; GCN-DAG: buffer_store_dword [[BFE]] -; GCN: s_endpgm define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, +; SI-LABEL: simplify_bfe_u32_multi_use_arg: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 63, v0 +; SI-NEXT: v_bfe_u32 v1, v0, 2, 2 +; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: simplify_bfe_u32_multi_use_arg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, 63, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { %src = load i32, i32 addrspace(1)* %in, align 4 @@ -567,50 +1586,150 @@ ret void } -; GCN-LABEL: {{^}}lshr_and: -; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; GCN: buffer_store_dword define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { +; SI-LABEL: lshr_and: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_u32 s0, s2, 0x30006 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: lshr_and: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bfe_u32 s0, s0, 0x30006 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %b = lshr i32 %a, 6 %c = and i32 %b, 7 store i32 %c, i32 addrspace(1)* %out, align 8 ret void } -; GCN-LABEL: {{^}}v_lshr_and: -; GCN: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 -; GCN: buffer_store_dword define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +; SI-LABEL: v_lshr_and: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: 
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_bfe_u32 v0, s2, v0, 3
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_lshr_and:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_bfe_u32 v0, s0, v0, 3
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
   %c = lshr i32 %a, %b
   %d = and i32 %c, 7
   store i32 %d, i32 addrspace(1)* %out, align 8
   ret void
 }
-; GCN-LABEL: {{^}}and_lshr:
-; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
-; GCN: buffer_store_dword
 define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
+; SI-LABEL: and_lshr:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: and_lshr:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bfe_u32 s0, s0, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
   %b = and i32 %a, 448
   %c = lshr i32 %b, 6
   store i32 %c, i32 addrspace(1)* %out, align 8
   ret void
 }
-; GCN-LABEL: {{^}}and_lshr2:
-; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
-; GCN: buffer_store_dword
 define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 {
+; SI-LABEL: and_lshr2:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: and_lshr2:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bfe_u32 s0, s0, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
   %b = and i32 %a, 511
   %c = lshr i32 %b, 6
   store i32 %c, i32 addrspace(1)* %out, align 8
   ret void
 }
-; GCN-LABEL: {{^}}shl_lshr:
-; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002
-; GCN: buffer_store_dword
 define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
+; SI-LABEL: shl_lshr:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_u32 s0, s2, 0x150002
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: shl_lshr:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bfe_u32 s0, s0, 0x150002
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
   %b = shl i32 %a, 9
   %c = lshr i32 %b, 11
   store i32 %c, i32 addrspace(1)* %out, align 8
Index: llvm/trunk/utils/UpdateTestChecks/asm.py
===================================================================
--- llvm/trunk/utils/UpdateTestChecks/asm.py
+++ llvm/trunk/utils/UpdateTestChecks/asm.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import re
 import sys
@@ -199,6 +200,15 @@
   asm = common.SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm)
   return asm
+def get_triple_from_march(march):
+  triples = {
+      'amdgcn': 'amdgcn',
+  }
+  for prefix, triple in triples.items():
+    if march.startswith(prefix):
+      return triple
+  print("Cannot find a triple. Assume 'x86'", file=sys.stderr)
+  return 'x86'
 def build_function_body_dictionary_for_triple(args, raw_tool_output, triple,
                                               prefixes, func_dict):
   target_handlers = {
Index: llvm/trunk/utils/update_llc_test_checks.py
===================================================================
--- llvm/trunk/utils/update_llc_test_checks.py
+++ llvm/trunk/utils/update_llc_test_checks.py
@@ -81,6 +81,11 @@
     if m:
       triple_in_cmd = m.groups()[0]
+    march_in_cmd = None
+    m = common.MARCH_ARG_RE.search(llc_cmd)
+    if m:
+      march_in_cmd = m.groups()[0]
+
     filecheck_cmd = ''
     if len(commands) > 1:
       filecheck_cmd = commands[1]
@@ -102,24 +107,25 @@
     # FIXME: We should use multiple check prefixes to common check lines. For
     # now, we just ignore all but the last.
-    run_list.append((check_prefixes, llc_cmd_args, triple_in_cmd))
+    run_list.append((check_prefixes, llc_cmd_args, triple_in_cmd, march_in_cmd))
   func_dict = {}
   for p in run_list:
     prefixes = p[0]
     for prefix in prefixes:
       func_dict.update({prefix: dict()})
-  for prefixes, llc_args, triple_in_cmd in run_list:
+  for prefixes, llc_args, triple_in_cmd, march_in_cmd in run_list:
     if args.verbose:
       print('Extracted LLC cmd: llc ' + llc_args, file=sys.stderr)
       print('Extracted FileCheck prefixes: ' + str(prefixes), file=sys.stderr)
     raw_tool_output = common.invoke_tool(args.llc_binary, llc_args, test)
-    if not (triple_in_cmd or triple_in_ir):
-      print("Cannot find a triple. Assume 'x86'", file=sys.stderr)
+    triple = triple_in_cmd or triple_in_ir
+    if not triple:
+      triple = asm.get_triple_from_march(march_in_cmd)
     asm.build_function_body_dictionary_for_triple(args, raw_tool_output,
-        triple_in_cmd or triple_in_ir or 'x86', prefixes, func_dict)
+        triple, prefixes, func_dict)
   is_in_function = False
   is_in_function_start = False
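
Note (not part of the patch): a minimal, self-contained sketch of how the new -march fallback is intended to behave for the RUN lines of the test above. The regex patterns below are simplified stand-ins for common.TRIPLE_ARG_RE and common.MARCH_ARG_RE (the real definitions live in UpdateTestChecks/common.py and may differ); the helper mirrors get_triple_from_march from the asm.py hunk, so a RUN line with -march=amdgcn but no -mtriple is scrubbed with the amdgcn handler instead of defaulting to x86.

# Hypothetical illustration only -- not part of the patch above.
from __future__ import print_function
import re
import sys

# Simplified stand-ins for common.TRIPLE_ARG_RE / common.MARCH_ARG_RE.
TRIPLE_ARG_RE = re.compile(r'-mtriple[= ]([^ ]+)')
MARCH_ARG_RE = re.compile(r'-march[= ]([^ ]+)')

def get_triple_from_march(march):
  # Same mapping as the new helper in UpdateTestChecks/asm.py: only amdgcn
  # is recognized; anything else falls back to x86 with a warning.
  triples = {
      'amdgcn': 'amdgcn',
  }
  for prefix, triple in triples.items():
    if march.startswith(prefix):
      return triple
  print("Cannot find a triple. Assume 'x86'", file=sys.stderr)
  return 'x86'

# RUN line from the test updated above, minus the FileCheck pipe.
llc_cmd = ('llc -amdgpu-scalarize-global-loads=false -march=amdgcn '
           '-verify-machineinstrs')

triple_in_cmd = None
m = TRIPLE_ARG_RE.search(llc_cmd)
if m:
  triple_in_cmd = m.groups()[0]

march_in_cmd = None
m = MARCH_ARG_RE.search(llc_cmd)
if m:
  march_in_cmd = m.groups()[0]

# The fallback chain added in update_llc_test_checks.py; triple_in_ir would
# come from a "target triple" line in the .ll file (assume none here).
triple_in_ir = None
triple = triple_in_cmd or triple_in_ir
if not triple:
  triple = get_triple_from_march(march_in_cmd)
print(triple)  # prints: amdgcn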