Index: llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -278,8 +278,8 @@ def AMDGPUround : SDNode<"ISD::FROUND", SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; -def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; -def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; @@ -460,3 +460,11 @@ def AMDGPUmul_i24 : PatFrags<(ops node:$src0, node:$src1), [(int_amdgcn_mul_i24 node:$src0, node:$src1), (AMDGPUmul_i24_impl node:$src0, node:$src1)]>; + +def AMDGPUbfe_i32 : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_sbfe node:$src0, node:$src1, node:$src2), + (AMDGPUbfe_i32_impl node:$src0, node:$src1, node:$src2)]>; + +def AMDGPUbfe_u32 : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2), + (AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>; Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -78,6 +78,9 @@ MachineRegisterInfo &MRI, int RSrcIdx) const; bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const; + bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper, + bool Signed) const; + void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const; Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -160,8 +160,9 @@ if (!Op.isReg()) continue; + // We may see physical registers if building a real MI Register Reg = Op.getReg(); - if (MRI.getRegClassOrRegBank(Reg)) + if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg)) continue; const RegisterBank *RB = NewBank; @@ -1444,6 +1445,65 @@ return true; } +bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic( + const OperandsMapper &OpdMapper, bool Signed) const { + MachineInstr &MI = OpdMapper.getMI(); + MachineRegisterInfo &MRI = OpdMapper.getMRI(); + + // Insert basic copies + applyDefaultMapping(OpdMapper); + + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(DstReg); + + const LLT S32 = LLT::scalar(32); + + const RegisterBank *DstBank = + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + if (DstBank == &AMDGPU::VGPRRegBank) { + if (Ty == S32) + return true; + + // TODO: 64-bit version is scalar only, so we need to expand this. + return false; + } + + Register SrcReg = MI.getOperand(2).getReg(); + Register OffsetReg = MI.getOperand(3).getReg(); + Register WidthReg = MI.getOperand(4).getReg(); + + // The scalar form packs the offset and width in a single operand. + + ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); + GISelObserverWrapper Observer(&ApplyBank); + MachineIRBuilder B(MI); + B.setChangeObserver(Observer); + + // Ensure the high bits are clear to insert the offset. + auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6)); + auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); + + // Zeros out the low bits, so don't bother clamping the input value. + auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16)); + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); + + // TODO: It might be worth using a pseudo here to avoid scc clobber and + // register class constraints. + unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : + (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); + + auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); + if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) + llvm_unreachable("failed to constrain BFE"); + + MI.eraseFromParent(); + return true; +} + // FIXME: Duplicated from LegalizerHelper static CmpInst::Predicate minMaxToCompare(unsigned Opc) { switch (Opc) { @@ -2592,8 +2652,12 @@ constrainOpWithReadfirstlane(MI, MRI, 5); return; } - default: - break; + case Intrinsic::amdgcn_sbfe: + applyMappingBFEIntrinsic(OpdMapper, true); + return; + case Intrinsic::amdgcn_ubfe: + applyMappingBFEIntrinsic(OpdMapper, false); + return; } break; } @@ -2687,7 +2751,11 @@ SmallVector OpdsMapping(MI.getNumOperands()); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); + const MachineOperand &SrcOp = MI.getOperand(i); + if (!SrcOp.isReg()) + continue; + + unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); } return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), @@ -3497,8 +3565,6 @@ case Intrinsic::amdgcn_fmad_ftz: case Intrinsic::amdgcn_mbcnt_lo: case Intrinsic::amdgcn_mbcnt_hi: - case Intrinsic::amdgcn_ubfe: - case Intrinsic::amdgcn_sbfe: case Intrinsic::amdgcn_mul_u24: case Intrinsic::amdgcn_mul_i24: case Intrinsic::amdgcn_lerp: @@ -3520,6 +3586,11 @@ case Intrinsic::amdgcn_sdot8: case Intrinsic::amdgcn_udot8: return getDefaultMappingVOP(MI); + case Intrinsic::amdgcn_sbfe: + case Intrinsic::amdgcn_ubfe: + if (isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_ds_swizzle: case Intrinsic::amdgcn_ds_permute: case Intrinsic::amdgcn_ds_bpermute: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -0,0 +1,962 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s + +define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 { +; GFX6-LABEL: v_bfe_i32_arg_arg_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src2) + ret i32 %bfe_i32 +} + +define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 { +; GFX6-LABEL: s_bfe_i32_arg_arg_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s1, s1, 63 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_bfe_i32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src2) + ret i32 %bfe_i32 +} + +; TODO: Need to expand this +; define i64 @v_bfe_i64_arg_arg_arg(i64 %src0, i32 %src1, i32 %src2) #0 { +; %bfe_i64 = call i32 @llvm.amdgcn.sbfe.i64(i32 %src0, i32 %src1, i32 %src2) +; ret i64 %bfe_i64 +; } + +define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 { +; GFX6-LABEL: s_bfe_i64_arg_arg_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s2, s2, 63 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], s2 +; GFX6-NEXT: ; return to shader part epilog + %bfe_i32 = call i64 @llvm.amdgcn.sbfe.i64(i64 %src0, i32 %src1, i32 %src2) + ret i64 %bfe_i32 +} + +define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +; GFX6-LABEL: bfe_i32_arg_arg_imm: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 63 +; GFX6-NEXT: s_or_b32 s0, s0, 0x7b0000 +; GFX6-NEXT: s_bfe_i32 s0, s2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { +; GFX6-LABEL: bfe_i32_arg_imm_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_or_b32 s0, 59, s0 +; GFX6-NEXT: s_bfe_i32 s0, s2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { +; GFX6-LABEL: bfe_i32_imm_arg_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s1, s2, 63 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: s_bfe_i32 s0, 0x7b, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 { +; GFX6-LABEL: v_bfe_print_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x80002 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %src0, align 4 + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 2, i32 8) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +; GFX6-LABEL: bfe_i32_arg_0_width_reg_offset: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 63 +; GFX6-NEXT: s_bfe_i32 s0, s2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +; GFX6-LABEL: bfe_i32_arg_0_width_imm_offset: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_i32_test_6: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1f0001 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_i32_test_7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1f0000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_i32_test_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_i32_test_9: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_i32_test_10: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1f0001 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_i32_test_11: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x180008 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 8, i32 24) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_i32_test_12: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x80018 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 24, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_i32_test_13: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = ashr i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_i32_test_14: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = lshr i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_i32 s2, 0, 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 0) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_i32 s2, 0x302e, 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 12334, i32 0, i32 0) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_i32 s2, 0, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_3: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_i32 s2, 1, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 1, i32 0, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_i32 s3, -1, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 0, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x10007 +; GFX6-NEXT: s_bfe_i32 s2, 0x80, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 7, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_6: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x80000 +; GFX6-NEXT: s_bfe_i32 s2, 0x80, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 0, i32 8) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x80000 +; GFX6-NEXT: s_bfe_i32 s2, 0x7f, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 0, i32 8) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x80006 +; GFX6-NEXT: s_bfe_i32 s2, 0x7f, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 6, i32 8) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_9: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x80010 +; GFX6-NEXT: s_bfe_i32 s2, 0x10000, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65536, i32 16, i32 8) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_10: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x100010 +; GFX6-NEXT: s_bfe_i32 s2, 0xffff, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65535, i32 16, i32 16) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_11: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x40004 +; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 4) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_12: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x1001f +; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 31, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_13: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x100010 +; GFX6-NEXT: s_bfe_i32 s2, 0x1fffe, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 131070, i32 16, i32 16) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_14: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x1e0002 +; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 2, i32 30) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x1c0004 +; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 28) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_i32 s3, -1, 0x70001 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 1, i32 7) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_17: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x1f0001 +; GFX6-NEXT: s_bfe_i32 s2, 0xff, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 1, i32 31) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_i32_constant_fold_test_18: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x1001f +; GFX6-NEXT: s_bfe_i32 s2, 0xff, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 31, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_sext_in_reg_i24: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GFX6-NEXT: s_lshl_b32 s0, s0, 8 +; GFX6-NEXT: s_ashr_i32 s0, s0, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 0, i32 24) + %shl = shl i32 %bfe, 8 + %ashr = ashr i32 %shl, 8 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FIXME +; define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; %src = load i32, i32 addrspace(1)* %in, align 4 +; %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) +; %div = sdiv i32 %bfe, 2 +; store i32 %div, i32 addrspace(1)* %out, align 4 +; ret void +; } + +define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +; GFX6-LABEL: bfe_0_width: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 8, i32 0) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +; GFX6-LABEL: bfe_8_bfe_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_mov_b32 s1, 0x80000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, s1 +; GFX6-NEXT: s_bfe_i32 s0, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8) + %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8) + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +; GFX6-LABEL: bfe_8_bfe_16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x80000 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8) + %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 16) + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +; This really should be folded into 1 +define amdgpu_kernel void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +; GFX6-LABEL: bfe_16_bfe_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100000 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x80000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 16) + %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8) + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure there isn't a redundant BFE +define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s2, s2, s0 +; GFX6-NEXT: s_bfe_i32 s0, s2, 0x80000 +; GFX6-NEXT: s_lshl_b32 s0, s0, 24 +; GFX6-NEXT: s_ashr_i32 s0, s0, 24 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %c = add i32 %a, %b ; add to prevent folding into extload + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 0, i32 8) + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe_wrong: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s2, s2, s0 +; GFX6-NEXT: s_bfe_i32 s0, s2, 8 +; GFX6-NEXT: s_lshl_b32 s0, s0, 24 +; GFX6-NEXT: s_ashr_i32 s0, s0, 24 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %c = add i32 %a, %b ; add to prevent folding into extload + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 8, i32 0) + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { +; GFX6-LABEL: sextload_i8_to_i32_bfe: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i8, i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 0, i32 8) + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { +; GFX6-LABEL: sextload_i8_to_i32_bfe_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 8, 0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i8, i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 8, i32 0) + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: sext_in_reg_i1_bfe_offset_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: sext_in_reg_i1_bfe_offset_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 30 +; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10001 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 30 + %shr = ashr i32 %shl, 30 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: sext_in_reg_i2_bfe_offset_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 30 +; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20001 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 30 + %shr = ashr i32 %shl, 30 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 2) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) #1 +declare i64 @llvm.amdgcn.sbfe.i64(i64, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -0,0 +1,1079 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s + +define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 { +; GFX6-LABEL: v_bfe_i32_arg_arg_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_u32 v0, v0, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] + %bfe_i32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src2) + ret i32 %bfe_i32 +} + +define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 { +; GFX6-LABEL: s_bfe_i32_arg_arg_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s1, s1, 63 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_bfe_u32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog + %bfe_i32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src2) + ret i32 %bfe_i32 +} + +; TODO: Need to expand this. +; define i64 @v_bfe_i64_arg_arg_arg(i64 %src0, i32 %src1, i32 %src2) #0 { +; %bfe_i64 = call i32 @llvm.amdgcn.ubfe.i64(i32 %src0, i32 %src1, i32 %src2) +; ret i64 %bfe_i64 +; } + +define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 { +; GFX6-LABEL: s_bfe_i64_arg_arg_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s2, s2, 63 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_bfe_u64 s[0:1], s[0:1], s2 +; GFX6-NEXT: ; return to shader part epilog + %bfe_i32 = call i64 @llvm.amdgcn.ubfe.i64(i64 %src0, i32 %src1, i32 %src2) + ret i64 %bfe_i32 +} + +define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { +; GFX6-LABEL: bfe_u32_arg_arg_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s1, s0, 63 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: s_bfe_u32 s0, s2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +; GFX6-LABEL: bfe_u32_arg_arg_imm: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 63 +; GFX6-NEXT: s_or_b32 s0, s0, 0x7b0000 +; GFX6-NEXT: s_bfe_u32 s0, s2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { +; GFX6-LABEL: bfe_u32_arg_imm_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_or_b32 s0, 59, s0 +; GFX6-NEXT: s_bfe_u32 s0, s2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { +; GFX6-LABEL: bfe_u32_imm_arg_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s1, s2, 63 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: s_bfe_u32 s0, 0x7b, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +; GFX6-LABEL: bfe_u32_arg_0_width_reg_offset: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 63 +; GFX6-NEXT: s_bfe_u32 s0, s2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +; GFX6-LABEL: bfe_u32_arg_0_width_imm_offset: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s0, s0, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_zextload_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 8 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i8, i8 addrspace(1)* %in + %ext = zext i8 %load to i32 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FIXME: Should be using s_add_i32 +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_zext_in_reg_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_zext_in_reg_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 16) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80001 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 1, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_3: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80003 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 3, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80007 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 7, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_zext_in_reg_i16_offset_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 8, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_3: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_lshr_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = lshr i32 %shl, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_6: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1f0001 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1f0000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_9: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_10: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1f0001 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_11: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x180008 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_12: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80018 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} +define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_13: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = ashr i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: bfe_u32_test_14: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b32 s0, s0, 31 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = lshr i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_u32 s2, 0, 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_u32 s2, 0x302e, 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_u32 s2, 0, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_3: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_u32 s2, 1, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_u32 s3, -1, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x10007 +; GFX6-NEXT: s_bfe_u32 s2, 0x80, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_6: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x80000 +; GFX6-NEXT: s_bfe_u32 s2, 0x80, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x80000 +; GFX6-NEXT: s_bfe_u32 s2, 0x7f, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x80006 +; GFX6-NEXT: s_bfe_u32 s2, 0x7f, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_9: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x80010 +; GFX6-NEXT: s_bfe_u32 s2, 0x10000, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_10: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x100010 +; GFX6-NEXT: s_bfe_u32 s2, 0xffff, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_11: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x40004 +; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_12: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x1001f +; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_13: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x100010 +; GFX6-NEXT: s_bfe_u32 s2, 0x1fffe, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_14: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x1e0002 +; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x1c0004 +; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_bfe_u32 s3, -1, 0x70001 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_17: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x1f0001 +; GFX6-NEXT: s_bfe_u32 s2, 0xff, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { +; GFX6-LABEL: bfe_u32_constant_fold_test_18: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, 0x1001f +; GFX6-NEXT: s_bfe_u32 s2, 0xff, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure that SimplifyDemandedBits doesn't cause the and to be +; reduced to the bits demanded by the bfe. + +; XXX: The operand to v_bfe_u32 could also just directly be the load register. +define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, +; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 63 +; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out1, + i32 addrspace(1)* %in) #0 { + %src = load i32, i32 addrspace(1)* %in, align 4 + %and = and i32 %src, 63 + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %and, i32 2, i32 2) + store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4 + store i32 %and, i32 addrspace(1)* %out1, align 4 + ret void +} + +define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { +; GFX6-LABEL: lshr_and: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b32 s0, s0, 6 +; GFX6-NEXT: s_and_b32 s0, s0, 7 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %b = lshr i32 %a, 6 + %c = and i32 %b, 7 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +; GFX6-LABEL: v_lshr_and: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b32 s0, s2, s0 +; GFX6-NEXT: s_and_b32 s0, s0, 7 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %c = lshr i32 %a, %b + %d = and i32 %c, 7 + store i32 %d, i32 addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 { +; GFX6-LABEL: and_lshr: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 0x1c0 +; GFX6-NEXT: s_lshr_b32 s0, s0, 6 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %b = and i32 %a, 448 + %c = lshr i32 %b, 6 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 { +; GFX6-LABEL: and_lshr2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 0x1ff +; GFX6-NEXT: s_lshr_b32 s0, s0, 6 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %b = and i32 %a, 511 + %c = lshr i32 %b, 6 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 { +; GFX6-LABEL: shl_lshr: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s0, 9 +; GFX6-NEXT: s_lshr_b32 s0, s0, 11 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %b = shl i32 %a, 9 + %c = lshr i32 %b, 11 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32) #1 +declare i64 @llvm.amdgcn.ubfe.i64(i64, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone }