diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1857,6 +1857,11 @@
   [IntrNoMem, IntrSpeculatable]
 >;
 
+def int_amdgcn_bfi : DefaultAttrsIntrinsic<[llvm_i32_ty],
+  [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
 def int_amdgcn_lerp :
   ClangBuiltin<"__builtin_amdgcn_lerp">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -607,6 +607,31 @@
     ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
     return;
   }
+  case AMDGPUISD::BFI: {
+    // Fold BFI with a known-constant mask: a zero mask selects the base
+    // (operand 2) and an all-ones mask selects the inserted value
+    // (operand 1). Any other constant mask selects V_BFI_B32 directly.
+    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(0));
+    if (!Mask)
+      break;
+
+    if (Mask->getZExtValue() == 0) {
+      ReplaceNode(N, N->getOperand(2).getNode());
+      return;
+    }
+
+    if (Mask->getSExtValue() == -1) {
+      ReplaceNode(N, N->getOperand(1).getNode());
+      return;
+    }
+
+    const SDValue Ops[] = {N->getOperand(0), N->getOperand(1),
+                           N->getOperand(2)};
+    SDNode *BFI = CurDAG->getMachineNode(AMDGPU::V_BFI_B32_e64, SDLoc(N),
+                                         N->getValueType(0), Ops);
+    ReplaceNode(N, BFI);
+    return;
+  }
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
     // There is a scalar version available, but unlike the vector version which
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -270,6 +270,6 @@
 def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
 def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
-def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfi_impl : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
 def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
 
 def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>;
@@ -474,6 +474,10 @@
   [(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2),
    (AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>;
 
+def AMDGPUbfi : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+  [(int_amdgcn_bfi node:$src0, node:$src1, node:$src2),
+   (AMDGPUbfi_impl node:$src0, node:$src1, node:$src2)]>;
+
 def AMDGPUfmul_legacy : PatFrags<(ops node:$src0, node:$src1),
   [(int_amdgcn_fmul_legacy node:$src0, node:$src1),
    (AMDGPUfmul_legacy_impl node:$src0, node:$src1)]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4267,6 +4267,7 @@
     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
+    case Intrinsic::amdgcn_bfi:
       return getDefaultMappingVOP(MI);
     case Intrinsic::amdgcn_sbfe:
     case Intrinsic::amdgcn_ubfe:
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -409,6 +409,6 @@
 >;
 
 def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
-  [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],
+  [(set i32:$dst, (AMDGPUbfi_impl i32:$src0, i32:$src1, i32:$src2))],
   VecALU
 >;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13,6 +13,7 @@
 
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
+#include "AMDGPUISelLowering.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -7276,6 +7277,9 @@
                        Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::amdgcn_sffbh:
     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
+  case Intrinsic::amdgcn_bfi:
+    return DAG.getNode(AMDGPUISD::BFI, DL, VT, Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
   case Intrinsic::amdgcn_sbfe:
     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
diff --git a/llvm/test/CodeGen/AMDGPU/bfi-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/bfi-intrinsic.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bfi-intrinsic.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+define i32 @v_bfi_simple(i32 %x, i32 %y, i32 %z) {
+; GCN-LABEL: v_bfi_simple:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_movk_i32 s4, 0x400
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %bfi = call i32 @llvm.amdgcn.bfi(i32 1024, i32 %x, i32 %y)
+  ret i32 %bfi
+}
+
+define i32 @v_bfi(i32 %x, i32 %y, i32 %z) {
+; GCN-LABEL: v_bfi:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_bfi_b32 v0, 1, v0, v1
+; GCN-NEXT:    v_bfi_b32 v0, 8, v0, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %bfi = call i32 @llvm.amdgcn.bfi(i32 1, i32 %x, i32 %y)
+  %bfi.1 = call i32 @llvm.amdgcn.bfi(i32 8, i32 %bfi, i32 %z)
+  ret i32 %bfi.1
+}
+
+define i32 @v_bfi_zero_mask(i32 %x, i32 %y, i32 %z) {
+; GCN-LABEL: v_bfi_zero_mask:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_lo_u32 v0, v1, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %bfi = call i32 @llvm.amdgcn.bfi(i32 0, i32 %x, i32 %y)
+  %ret = mul i32 %bfi, %z
+  ret i32 %ret
+}
+
+define i32 @v_bfi_minus_one_mask(i32 %x, i32 %y, i32 %z) {
+; GCN-LABEL: v_bfi_minus_one_mask:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %bfi = call i32 @llvm.amdgcn.bfi(i32 -1, i32 %x, i32 %y)
+  %ret = mul i32 %bfi, %z
+  ret i32 %ret
+}
+
+define i32 @v_bfi_non_const_mask(i32 %x, i32 %y, i32 %z, i32 %mask) {
+; GCN-LABEL: v_bfi_non_const_mask:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_bfi_b32 v0, v3, v0, v1
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %bfi = call i32 @llvm.amdgcn.bfi(i32 %mask, i32 %x, i32 %y)
+  %ret = mul i32 %bfi, %z
+  ret i32 %ret
+}
+
+declare i32 @llvm.amdgcn.bfi(i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ /dev/null
@@ -1,305 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-
-define float @v_bfi_single_nesting_level(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_single_nesting_level:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.insert = shl i32 %y.i32, 10
-  %bfi1.and = and i32 %shl.inner.insert, 1047552
-  %bfi1.andnot = and i32 %mul.base.i32, -1073740801
-  %bfi1.or = or i32 %bfi1.and, %bfi1.andnot
-  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
-  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
-  %and.outer = and i32 %shl.outer.insert, 1072693248
-  %or.outer = or i32 %bfi1.or, %and.outer
-  %result = bitcast i32 %or.outer to float
-  ret float %result
-}
-
-define float @v_bfi_single_nesting_level_swapped_operands(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_single_nesting_level_swapped_operands:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.insert = shl i32 %y.i32, 10
-  %bfi1.and = and i32 1047552, %shl.inner.insert
-  %bfi1.andnot = and i32 -1073740801, %mul.base.i32
-  %bfi1.or = or i32 %bfi1.and, %bfi1.andnot
-  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
-  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
-  %and.outer = and i32 %shl.outer.insert, 1072693248
-  %or.outer = or i32 %and.outer, %bfi1.or
-  %result = bitcast i32 %or.outer to float
-  ret float %result
-}
-
-define float @v_bfi_single_nesting_level_unbalanced_subtree(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_single_nesting_level_unbalanced_subtree:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v3, 0x3e0, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.2.insert = shl i32 %y.i32, 10
-  %bfi.inner.2.and.1 = and i32 %shl.inner.2.insert, 1047552
-  %bfi.inner.2.and.2 = and i32 %mul.base.i32, 992
-  %bfi.inner.2 = or i32 %bfi.inner.2.and.1, %bfi.inner.2.and.2
-  %mul.inner.1.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.inner.1.insert.1.i32 = fptoui float %mul.inner.1.insert to i32
-  %shl.inner.1.insert.1 = shl i32 %mul.inner.1.insert.1.i32, 20
-  %bfi.inner.1.and.1 = and i32 %shl.inner.1.insert.1, 1072693248
-  %bfi.inner.1.and.2 = and i32 %mul.base.i32, -1073741793
-  %bfi.inner.1 = or i32 %bfi.inner.1.and.2, %bfi.inner.2
-  %bfi.outer = or i32 %bfi.inner.1.and.1, %bfi.inner.1
-  %result = bitcast i32 %bfi.outer to float
-  ret float %result
-}
-
-define float @v_bfi_single_nesting_level_inner_use(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_single_nesting_level_inner_use:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0x400003ff, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.insert = shl i32 %y.i32, 10
-  %bfi1.and = and i32 %shl.inner.insert, 1047552
-  %bfi1.andnot = and i32 %mul.base.i32, -1073740801
-  %bfi1.or = or i32 %bfi1.and, %bfi1.andnot
-  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
-  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
-  %and.outer = and i32 %shl.outer.insert, 1072693248
-  %or.outer = or i32 %bfi1.or, %and.outer
-  %bfi1.or.seconduse = mul i32 %bfi1.or, 2
-  %result = bitcast i32 %bfi1.or.seconduse to float
-  ret float %result
-}
-
-define float @v_bfi_no_nesting(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_no_nesting:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc0000400, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.insert = shl i32 %y.i32, 10
-  %inner.and = and i32 %shl.inner.insert, 1047552
-  %inner.and2 = and i32 %mul.base.i32, -1073740800
-  %inner.or = or i32 %inner.and, %inner.and2
-  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
-  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
-  %and.outer = and i32 %shl.outer.insert, 1072693248
-  %or.outer = or i32 %inner.or, %and.outer
-  %result = bitcast i32 %or.outer to float
-  ret float %result
-}
-
-define float @v_bfi_two_levels(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_two_levels:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v3, 0x3e0, v3
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %y.i32 = fptoui float %y to i32
-  %shl.insert.inner = shl i32 %y.i32, 5
-  %and.insert.inner = and i32 %shl.insert.inner, 992
-  %z.i32 = fptoui float %z to i32
-  %base.inner = and i32 %z.i32, -1073741793
-  %or.inner = or i32 %and.insert.inner , %base.inner
-  %shl.insert.mid = shl i32 %y.i32, 10
-  %and.insert.mid = and i32 %shl.insert.mid, 1047552
-  %or.mid = or i32 %or.inner, %and.insert.mid
-  %fmul.insert.outer = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %cast.insert.outer = fptoui float %fmul.insert.outer to i32
-  %shl.insert.outer = shl i32 %cast.insert.outer, 20
-  %and.insert.outer = and i32 %shl.insert.outer, 1072693248
-  %or.outer = or i32 %or.mid, %and.insert.outer
-  %result = bitcast i32 %or.outer to float
-  ret float %result
-}
-
-define float @v_bfi_two_levels_inner_or_multiple_uses(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_two_levels_inner_or_multiple_uses:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v3, 0x3e0, v3
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %y.i32 = fptoui float %y to i32
-  %shl.insert.inner = shl i32 %y.i32, 5
-  %and.insert.inner = and i32 %shl.insert.inner, 992
-  %z.i32 = fptoui float %z to i32
-  %base.inner = and i32 %z.i32, -1073741793
-  %or.inner = or i32 %and.insert.inner , %base.inner
-  %shl.insert.mid = shl i32 %y.i32, 10
-  %and.insert.mid = and i32 %shl.insert.mid, 1047552
-  %or.mid = or i32 %or.inner, %and.insert.mid
-  %fmul.insert.outer = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %cast.insert.outer = fptoui float %fmul.insert.outer to i32
-  %shl.insert.outer = shl i32 %cast.insert.outer, 20
-  %and.insert.outer = and i32 %shl.insert.outer, 1072693248
-  %or.outer = or i32 %or.mid, %and.insert.outer
-  %result = bitcast i32 %or.outer to float
-  %or.inner.float = bitcast i32 %or.inner to float
-  %result2 = fmul float %result, %or.inner.float
-  ret float %result2
-}
-
-define float @v_bfi_single_constant_as_partition(float %x, float %y, float %z) {
-; GCN-LABEL: v_bfi_single_constant_as_partition:
-; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-.entry:
-  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
-  %mul.base.i32 = fptoui float %mul.base to i32
-  %y.i32 = fptoui float %y to i32
-  %shl.inner.insert = shl i32 %y.i32, 10
-  %bfi1.or = or i32 %shl.inner.insert, %mul.base.i32
-  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
-  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
-  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
-  %and.outer = and i32 %shl.outer.insert, -1
-  %or.outer = or i32 %bfi1.or, %and.outer
-  %result = bitcast i32 %or.outer to float
-  ret float %result
-}
-
-define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %out, i16 %a, i32 %b) {
-; GCN-LABEL: v_bfi_dont_applied_for_scalar_ops:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s3, s3, 0xffff0000
-; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
-; GCN-NEXT:    s_or_b32 s2, s2, s3
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-NEXT:    s_endpgm
-  %shift = lshr i32 %b, 16
-  %tr = trunc i32 %shift to i16
-  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
-  %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
-  %val = bitcast <2 x i16> %vec to i32
-  store i32 %val, ptr addrspace(1) %out, align 4
-  ret void
-}