diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1857,6 +1857,11 @@
   [IntrNoMem, IntrSpeculatable]
 >;
 
+def int_amdgcn_bfi : DefaultAttrsIntrinsic<[llvm_i32_ty],
+  [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
 def int_amdgcn_lerp :
   ClangBuiltin<"__builtin_amdgcn_lerp">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -607,6 +607,31 @@
     ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
     return;
   }
+  case AMDGPUISD::BFI: {
+    // Fold BFI with a known-constant mask: a zero mask selects the base
+    // (operand 2) and an all-ones mask selects the inserted value
+    // (operand 1). Any other constant mask selects V_BFI_B32 directly.
+    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(0));
+    if (!Mask)
+      break;
+
+    if (Mask->getZExtValue() == 0) {
+      ReplaceNode(N, N->getOperand(2).getNode());
+      return;
+    }
+
+    if (Mask->getSExtValue() == -1) {
+      ReplaceNode(N, N->getOperand(1).getNode());
+      return;
+    }
+
+    const SDValue Ops[] = {N->getOperand(0), N->getOperand(1),
+                           N->getOperand(2)};
+    SDNode *BFI = CurDAG->getMachineNode(AMDGPU::V_BFI_B32_e64, SDLoc(N),
+                                         N->getValueType(0), Ops);
+    ReplaceNode(N, BFI);
+    return;
+  }
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
     // There is a scalar version available, but unlike the vector version which
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -270,6 +270,6 @@
 def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
 def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
-def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfi_impl : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
 def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
 
 def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>;
@@ -474,6 +474,10 @@
   [(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2),
    (AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>;
 
+def AMDGPUbfi : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+  [(int_amdgcn_bfi node:$src0, node:$src1, node:$src2),
+   (AMDGPUbfi_impl node:$src0, node:$src1, node:$src2)]>;
+
 def AMDGPUfmul_legacy : PatFrags<(ops node:$src0, node:$src1),
   [(int_amdgcn_fmul_legacy node:$src0, node:$src1),
    (AMDGPUfmul_legacy_impl node:$src0, node:$src1)]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4267,6 +4267,7 @@
     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
+    case Intrinsic::amdgcn_bfi:
       return getDefaultMappingVOP(MI);
     case Intrinsic::amdgcn_sbfe:
     case Intrinsic::amdgcn_ubfe:
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -409,6 +409,6 @@
 >;
 
 def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
-  [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],
+  [(set i32:$dst, (AMDGPUbfi_impl i32:$src0, i32:$src1, i32:$src2))],
   VecALU
 >;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13,6 +13,7 @@
 
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
+#include "AMDGPUISelLowering.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -7276,6 +7277,9 @@
                        Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::amdgcn_sffbh:
     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
+  case Intrinsic::amdgcn_bfi:
+    return DAG.getNode(AMDGPUISD::BFI, DL, VT, Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
   case Intrinsic::amdgcn_sbfe:
     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
diff --git a/llvm/test/CodeGen/AMDGPU/bfi-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/bfi-intrinsic.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bfi-intrinsic.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+define i32 @v_bfi_simple(i32 %x, i32 %y, i32 %z) {
+; GCN-LABEL: v_bfi_simple:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_movk_i32 s4, 0x400
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %bfi = call i32 @llvm.amdgcn.bfi(i32 1024, i32 %x, i32 %y)
+  ret i32 %bfi
+}
+
+define i32 @v_bfi(i32 %x, i32 %y, i32 %z) {
+; GCN-LABEL: v_bfi:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_bfi_b32 v0, 1, v0, v1
+; GCN-NEXT:    v_bfi_b32 v0, 8, v0, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %bfi = call i32 @llvm.amdgcn.bfi(i32 1, i32 %x, i32 %y)
+  %bfi.1 = call i32 @llvm.amdgcn.bfi(i32 8, i32 %bfi, i32 %z)
+  ret i32 %bfi.1
+}
+
+define i32 @v_bfi_zero_mask(i32 %x, i32 %y, i32 %z) {
+; GCN-LABEL: v_bfi_zero_mask:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_lo_u32 v0, v1, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %bfi = call i32 @llvm.amdgcn.bfi(i32 0, i32 %x, i32 %y)
+  %ret = mul i32 %bfi, %z
+  ret i32 %ret
+}
+
+define i32 @v_bfi_minus_one_mask(i32 %x, i32 %y, i32 %z) {
+; GCN-LABEL: v_bfi_minus_one_mask:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %bfi = call i32 @llvm.amdgcn.bfi(i32 -1, i32 %x, i32 %y)
+  %ret = mul i32 %bfi, %z
+  ret i32 %ret
+}
+
+define i32 @v_bfi_non_const_mask(i32 %x, i32 %y, i32 %z, i32 %mask) {
+; GCN-LABEL: v_bfi_non_const_mask:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_bfi_b32 v0, v3, v0, v1
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %bfi = call i32 @llvm.amdgcn.bfi(i32 %mask, i32 %x, i32 %y)
+  %ret = mul i32 %bfi, %z
+  ret i32 %ret
+}
+
+declare i32 @llvm.amdgcn.bfi(i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ /dev/null
@@ -1,305 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-
-define float @v_bfi_single_nesting_level(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_single_nesting_level:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.insert = shl i32 %y.i32, 10
-  %bfi1.and = and i32 %shl.inner.insert, 1047552
-  %bfi1.andnot = and i32 %mul.base.i32, -1073740801
-  %bfi1.or = or i32 %bfi1.and, %bfi1.andnot
-  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
-  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
-  %and.outer = and i32 %shl.outer.insert, 1072693248
-  %or.outer = or i32 %bfi1.or, %and.outer
-  %result = bitcast i32 %or.outer to float
-  ret float %result
-}
-
-define float @v_bfi_single_nesting_level_swapped_operands(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_single_nesting_level_swapped_operands:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.insert = shl i32 %y.i32, 10
-  %bfi1.and = and i32 1047552, %shl.inner.insert
-  %bfi1.andnot = and i32 -1073740801, %mul.base.i32
-  %bfi1.or = or i32 %bfi1.and, %bfi1.andnot
-  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
-  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
-  %and.outer = and i32 %shl.outer.insert, 1072693248
-  %or.outer = or i32 %and.outer, %bfi1.or
-  %result = bitcast i32 %or.outer to float
-  ret float %result
-}
-
-define float @v_bfi_single_nesting_level_unbalanced_subtree(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_single_nesting_level_unbalanced_subtree:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v3, 0x3e0, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.2.insert = shl i32 %y.i32, 10
-  %bfi.inner.2.and.1 = and i32 %shl.inner.2.insert, 1047552
-  %bfi.inner.2.and.2 = and i32 %mul.base.i32, 992
-  %bfi.inner.2 = or i32 %bfi.inner.2.and.1, %bfi.inner.2.and.2
-  %mul.inner.1.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.inner.1.insert.1.i32 = fptoui float %mul.inner.1.insert to i32
-  %shl.inner.1.insert.1 = shl i32 %mul.inner.1.insert.1.i32, 20
-  %bfi.inner.1.and.1 = and i32 %shl.inner.1.insert.1, 1072693248
-  %bfi.inner.1.and.2 = and i32 %mul.base.i32, -1073741793
-  %bfi.inner.1 = or i32 %bfi.inner.1.and.2, %bfi.inner.2
-  %bfi.outer = or i32 %bfi.inner.1.and.1, %bfi.inner.1
-  %result = bitcast i32 %bfi.outer to float
-  ret float %result
-}
-
-define float @v_bfi_single_nesting_level_inner_use(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_single_nesting_level_inner_use:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0x400003ff, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.insert = shl i32 %y.i32, 10
-  %bfi1.and = and i32 %shl.inner.insert, 1047552
-  %bfi1.andnot = and i32 %mul.base.i32, -1073740801
-  %bfi1.or = or i32 %bfi1.and, %bfi1.andnot
-  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
-  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
-  %and.outer = and i32 %shl.outer.insert, 1072693248
-  %or.outer = or i32 %bfi1.or, %and.outer
-  %bfi1.or.seconduse = mul i32 %bfi1.or, 2
-  %result = bitcast i32 %bfi1.or.seconduse to float
-  ret float %result
-}
-
-define float @v_bfi_no_nesting(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_no_nesting:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc0000400, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.insert = shl i32 %y.i32, 10
-  %inner.and = and i32 %shl.inner.insert, 1047552
-  %inner.and2 = and i32 %mul.base.i32, -1073740800
-  %inner.or = or i32 %inner.and, %inner.and2
-  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
-  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
-  %and.outer = and i32 %shl.outer.insert, 1072693248
-  %or.outer = or i32 %inner.or, %and.outer
-  %result = bitcast i32 %or.outer to float
-  ret float %result
-}
-
-define float @v_bfi_two_levels(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_two_levels:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v3, 0x3e0, v3
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %y.i32 = fptoui float %y to i32
-  %shl.insert.inner = shl i32 %y.i32, 5
-  %and.insert.inner = and i32 %shl.insert.inner, 992
-  %z.i32 = fptoui float %z to i32
-  %base.inner = and i32 %z.i32, -1073741793
-  %or.inner = or i32 %and.insert.inner , %base.inner
-  %shl.insert.mid = shl i32 %y.i32, 10
-  %and.insert.mid = and i32 %shl.insert.mid, 1047552
-  %or.mid = or i32 %or.inner, %and.insert.mid
-  %fmul.insert.outer = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %cast.insert.outer = fptoui float %fmul.insert.outer to i32
-  %shl.insert.outer = shl i32 %cast.insert.outer, 20
-  %and.insert.outer = and i32 %shl.insert.outer, 1072693248
-  %or.outer = or i32 %or.mid, %and.insert.outer
-  %result = bitcast i32 %or.outer to float
-  ret float %result
-}
-
-define float @v_bfi_two_levels_inner_or_multiple_uses(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_two_levels_inner_or_multiple_uses:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v3, 0x3e0, v3
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %y.i32 = fptoui float %y to i32
-  %shl.insert.inner = shl i32 %y.i32, 5
-  %and.insert.inner = and i32 %shl.insert.inner, 992
-  %z.i32 = fptoui float %z to i32
-  %base.inner = and i32 %z.i32, -1073741793
-  %or.inner = or i32 %and.insert.inner , %base.inner
-  %shl.insert.mid = shl i32 %y.i32, 10
-  %and.insert.mid = and i32 %shl.insert.mid, 1047552
-  %or.mid = or i32 %or.inner, %and.insert.mid
-  %fmul.insert.outer = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %cast.insert.outer = fptoui float %fmul.insert.outer to i32
-  %shl.insert.outer = shl i32 %cast.insert.outer, 20
-  %and.insert.outer = and i32 %shl.insert.outer, 1072693248
-  %or.outer = or i32 %or.mid, %and.insert.outer
-  %result = bitcast i32 %or.outer to float
-  %or.inner.float = bitcast i32 %or.inner to float
-  %result2 = fmul float %result, %or.inner.float
-  ret float %result2
-}
-
-define float @v_bfi_single_constant_as_partition(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_single_constant_as_partition:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.insert = shl i32 %y.i32, 10
-  %bfi1.or = or i32 %shl.inner.insert, %mul.base.i32
-  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
-  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
-  %and.outer = and i32 %shl.outer.insert, -1
-  %or.outer = or i32 %bfi1.or, %and.outer
-  %result = bitcast i32 %or.outer to float
-  ret float %result
-}
-
-define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %out, i16 %a, i32 %b) {
-; GCN-LABEL: v_bfi_dont_applied_for_scalar_ops:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s3, s3, 0xffff0000
-; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
-; GCN-NEXT:    s_or_b32 s2, s2, s3
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-NEXT:    s_endpgm
-  %shift = lshr i32 %b, 16
-  %tr = trunc i32 %shift to i16
-  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
-  %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
-  %val = bitcast <2 x i16> %vec to i32
-  store i32 %val, ptr addrspace(1) %out, align 4
-  ret void
-}