diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -262,6 +262,7 @@
   void SelectFMUL_W_CHAIN(SDNode *N);
   SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset,
                    uint32_t Width);
+  bool SelectV_BFI(SDNode *N);
   void SelectS_BFEFromShifts(SDNode *N);
   void SelectS_BFE(SDNode *N);
   bool isCBranchSCC(const SDNode *N) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -674,6 +674,11 @@
     SelectS_BFE(N);
     return;
+  case ISD::OR:
+    if (SelectV_BFI(N))
+      return;
+
+    break;
   case ISD::BRCOND:
     SelectBRCOND(N);
     return;
@@ -2146,6 +2151,136 @@
   SelectCode(N);
 }
 
+// Try matching a 32-bit expression
+//   (X_1 & C_1) | (X_2 & C_2) | ... | (X_n & C_n)
+// with constants C_i. If matching succeeds, convert it to a sequence of
+// nested V_BFI instructions, if possible.
+// Creating the canonical bitfieldInsert form in the frontend, where one
+// bitfieldInsert is the base for another bitfieldInsert, lets
+// SimplifyDemandedBits simplify the bitmasks involved.
+// Originally, the intended expression looks like:
+//   (X1 & C1) | (~C1 & ((X2 & C2) | (X3 & ~C3)))
+// However, after SimplifyDemandedBits, the expression reaching ISel looks
+// like:
+//   (X1 & C1) | ((X2 & C2) | (X3 & (~C1 | ~C3)))
+bool AMDGPUDAGToDAGISel::SelectV_BFI(SDNode *N) {
+  if (!N->isDivergent())
+    return false;
+
+  struct MaskData {
+    SDNode *BaseOperand;
+    SDNode *MaskOperand;
+    llvm::Optional<uint64_t> Mask;
+  };
+
+  struct BFIOperandData {
+    SDNode *Or;
+    MaskData Mask;
+  };
+
+  SmallVector<BFIOperandData> BFIOps;
+  uint64_t Disjunction = 0;
+
+  auto ExtractMask = [](SDNode *And) -> MaskData {
+    ConstantSDNode *MaskOperand = dyn_cast<ConstantSDNode>(And->getOperand(0));
+    SDNode *BaseOperand = And->getOperand(1).getNode();
+    if (!MaskOperand) {
+      BaseOperand = And->getOperand(0).getNode();
+      MaskOperand = dyn_cast<ConstantSDNode>(And->getOperand(1));
+    }
+    if (!MaskOperand)
+      return {};
+
+    return {BaseOperand, MaskOperand, MaskOperand->getZExtValue()};
+  };
+
+  // Check if this is an AND with a constant operand. Check that the mask from
+  // the constant operand fulfills several conditions, update the total mask,
+  // and save the operand nodes.
+  enum class BFICheckStatus : uint8_t { Proceed, Abort };
+  auto CreateBFIOpFromAnd = [&](SDNode *Or, SDNode *And) -> BFICheckStatus {
+    if (And->getOpcode() != ISD::AND)
+      return BFICheckStatus::Proceed;
+    const auto MaskData = ExtractMask(And);
+    if (!MaskData.Mask.has_value())
+      return BFICheckStatus::Proceed;
+
+    // The masks need to be disjoint.
+    if ((Disjunction & MaskData.Mask.value()) != 0)
+      return BFICheckStatus::Abort;
+
+    Disjunction |= MaskData.Mask.value();
+    BFIOps.push_back({Or, MaskData});
+    return BFICheckStatus::Proceed;
+  };
+
+  // Collect all OR instructions, so the constant masks from the AND
+  // instructions can be fetched later.
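+  // The walk below uses an explicit worklist instead of recursion. Only
+  // divergent, single-use ORs are treated as candidates, so every
+  // intermediate OR can be absorbed into the resulting V_BFI chain without
+  // leaving other users behind.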
+  SmallVector<SDNode *> OrStack;
+  SmallVector<SDNode *> OrCandidates;
+  OrStack.push_back(N);
+  OrCandidates.push_back(N);
+  auto IsCandidate = [](SDValue Or) -> bool {
+    return Or->isDivergent() && Or->getOpcode() == ISD::OR && Or->hasOneUse();
+  };
+
+  // Push all possible OR candidates, including the function argument, onto a
+  // stack.
+  while (!OrStack.empty()) {
+    SDNode *Current = OrStack.back();
+    OrStack.pop_back();
+
+    SDValue O0 = Current->getOperand(0);
+    SDValue O1 = Current->getOperand(1);
+
+    if (IsCandidate(O0)) {
+      OrStack.push_back(O0.getNode());
+      OrCandidates.push_back(O0.getNode());
+    }
+
+    if (IsCandidate(O1)) {
+      OrStack.push_back(O1.getNode());
+      OrCandidates.push_back(O1.getNode());
+    }
+
+    if (O0.getOpcode() != ISD::OR && O1.getOpcode() != ISD::OR)
+      break;
+  }
+
+  // Record the AND operands and bitmasks, and return false if there was a
+  // failure (an unsuitable bitmask). This happens as a separate step so the
+  // top-level node does not need to be treated separately.
+  while (!OrCandidates.empty()) {
+    SDNode *Current = OrCandidates.back();
+    OrCandidates.pop_back();
+    SDValue LHS = Current->getOperand(0);
+    SDValue RHS = Current->getOperand(1);
+
+    if (CreateBFIOpFromAnd(Current, RHS.getNode()) == BFICheckStatus::Abort ||
+        CreateBFIOpFromAnd(Current, LHS.getNode()) == BFICheckStatus::Abort)
+      return false;
+  }
+
+  // The constants are required to be a partition of -1.
+  if (Disjunction != 0xFFFFFFFF)
+    return false;
+
+  // Transform the operands node by node and replace the original node.
+  // Assume the last node is the base for the BFI insertion. V_BFI_B32
+  // computes (S0 & S1) | (~S0 & S2), i.e. it inserts S1 under the mask S0
+  // into the base S2.
+  SDNode *BaseNode = BFIOps.front().Mask.BaseOperand;
+  assert(BaseNode && "Cannot use non-existing node as base!");
+  for (auto &BFITuple : makeArrayRef(BFIOps).drop_front()) {
+    BaseNode = CurDAG->getMachineNode(
+        AMDGPU::V_BFI_B32_e64, SDLoc(BFITuple.Or), BFITuple.Or->getValueType(0),
+        {SDValue(BFITuple.Mask.MaskOperand, 0),
+         SDValue(BFITuple.Mask.BaseOperand, 0), SDValue(BaseNode, 0)});
+  }
+
+  ReplaceNode(N, BaseNode);
+
+  return true;
+}
+
 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
   switch (N->getOpcode()) {
   case ISD::AND:
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -7,16 +7,15 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    s_mov_b32 s4, 0xffc00
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
+; GCN-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_mov_b32 s4, 0x3ff00000
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -41,16 +40,15 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    s_mov_b32 s4, 0xffc00
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
+; GCN-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_mov_b32 s4, 0x3ff00000
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -75,18 +73,17 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    s_mov_b32 s4, 0xffc00
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
+; GCN-NEXT:    s_mov_b32 s5, 0xc000001f
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v3, 0x3e0, v2
+; GCN-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_bfi_b32 v1, s5, v2, v1
+; GCN-NEXT:    s_mov_b32 s4, 0x3ff00000
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -178,18 +175,17 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    s_movk_i32 s4, 0x3e0
+; GCN-NEXT:    s_mov_b32 s5, 0xffc00
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v3, 0x3e0, v3
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
+; GCN-NEXT:    v_bfi_b32 v2, s4, v3, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_bfi_b32 v1, s5, v1, v2
+; GCN-NEXT:    s_mov_b32 s4, 0x3ff00000
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %y.i32 = fptoui float %y to i32