diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -262,6 +262,7 @@
   void SelectFMUL_W_CHAIN(SDNode *N);
   SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset,
                    uint32_t Width);
+  bool SelectV_BFI(SDNode *N);
   void SelectS_BFEFromShifts(SDNode *N);
   void SelectS_BFE(SDNode *N);
   bool isCBranchSCC(const SDNode *N) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -674,6 +674,11 @@
     SelectS_BFE(N);
     return;
+  case ISD::OR:
+    if (SelectV_BFI(N))
+      return;
+
+    break;
   case ISD::BRCOND:
     SelectBRCOND(N);
     return;
@@ -2146,6 +2151,89 @@
   SelectCode(N);
 }
 
+// Try matching a 32-bit expression
+//   (X_1 & C_1) | (X_2 & C_2) | ... | (X_n & C_n)
+// with the constants C_i. If matching succeeds, convert it into a sequence
+// of nested V_BFI instructions.
+// Creating the canonical bitfieldInsert form in the frontend, where one
+// bitfieldInsert is the base of another, lets SimplifyDemandedBits simplify
+// the bitmasks involved.
+// Originally, the intended expression looks like:
+//   (X1 & C1) | (~C1 & ((X2 & C2) | (X3 & ~C3)))
+// However, after SimplifyDemandedBits has run during ISel, the expression
+// will look like:
+//   (X1 & C1) | ((X2 & C2) | (X3 & (~C1 & ~C3)))
+bool AMDGPUDAGToDAGISel::SelectV_BFI(SDNode *N) {
+  struct BFIOperandData {
+    SDNode *X;
+    uint32_t CVal;
+  };
+
+  SmallVector<BFIOperandData> BFIOps;
+  uint32_t Disjunction = 0;
+
+  auto InsertBFIOp = [&](SDNode *And) -> bool {
+    if (!And->hasOneUse())
+      return false;
+
+    ConstantSDNode *Base = dyn_cast<ConstantSDNode>(And->getOperand(1));
+    if (!Base)
+      return false;
+
+    const uint32_t Mask = Base->getZExtValue();
+    // The masks need to be disjoint.
+    if ((Disjunction & Mask) != 0)
+      return false;
+
+    Disjunction |= Mask;
+    BFIOps.push_back({And, Mask});
+    return true;
+  };
+
+  // Fetch all AND nodes as well as the values of their constant operands.
+  SDNode *CurrentOr = N;
+  while (CurrentOr && (CurrentOr == N || CurrentOr->hasOneUse())) {
+    SDValue O0 = CurrentOr->getOperand(0);
+    SDValue O1 = CurrentOr->getOperand(1);
+    if (O1.getOpcode() != ISD::AND)
+      return false;
+
+    if (O0.getOpcode() == ISD::OR) {
+      if (!InsertBFIOp(O1.getNode()))
+        return false;
+
+      CurrentOr = O0.getNode();
+    } else if (O0.getOpcode() == ISD::AND) {
+      if (!InsertBFIOp(O0.getNode()) || !InsertBFIOp(O1.getNode()))
+        return false;
+
+      break;
+    } else {
+      return false;
+    }
+  }
+
+  // Require that at least two nesting levels have been visited.
+  // The constants are required to form a partition of -1.
+  if (CurrentOr == N || Disjunction != 0xFFFFFFFF)
+    return false;
+
+  // Transform the operands node by node and replace the original node.
+  // Start with the bottom-most node.
+  SDNode *NewNode = BFIOps.back().X->getOperand(0).getNode();
+
+  assert(NewNode && "Cannot use non-existing node as base!");
+  for (auto &BFITuple : reverse(makeArrayRef(BFIOps).drop_back())) {
+    NewNode = CurDAG->getMachineNode(
+        AMDGPU::V_BFI_B32_e64, SDLoc(N), N->getValueType(0),
+        {BFITuple.X->getOperand(1), BFITuple.X->getOperand(0),
+         SDValue(NewNode, 0)});
+  }
+
+  ReplaceNode(N, NewNode);
+
+  return true;
+}
+
 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
   switch (N->getOpcode()) {
   case ISD::AND:
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -15,17 +15,15 @@
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x3ff00000, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -114,18 +112,16 @@
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_bfi_b32 v2, 0x3e0, v3, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-NEXT:    v_and_or_b32 v2, 0x3e0, v3, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX10-NEXT:    v_or3_b32 v0, v2, v1, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x3ff00000, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %y.i32 = fptoui float %y to i32
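
Not part of the patch itself: a minimal host-side sketch of the equivalence the selection relies on, assuming the documented V_BFI_B32 semantics D = (S0 & S1) | (~S0 & S2). The masks reuse the constants from the first test case; the helper name and the sample input values are illustrative only.

// Standalone sanity check: models V_BFI_B32 and verifies that the nested BFI
// sequence reproduces the masked-OR expression when the masks partition
// 0xFFFFFFFF (the Disjunction == 0xFFFFFFFF requirement in SelectV_BFI).
#include <cassert>
#include <cstdint>

static uint32_t bfi(uint32_t Mask, uint32_t Ins, uint32_t Base) {
  return (Mask & Ins) | (~Mask & Base); // V_BFI_B32: D = (S0 & S1) | (~S0 & S2)
}

int main() {
  // Disjoint masks covering all 32 bits, taken from the first test.
  const uint32_t C1 = 0x3ff00000u, C2 = 0x000ffc00u, C3 = 0xc00003ffu;
  assert((C1 & C2) == 0 && (C1 & C3) == 0 && (C2 & C3) == 0);
  assert((C1 | C2 | C3) == 0xffffffffu);

  // Arbitrary sample inputs.
  const uint32_t X1 = 0xdeadbeefu, X2 = 0x12345678u, X3 = 0xcafef00du;

  // The matched expression: (X1 & C1) | (X2 & C2) | (X3 & C3).
  const uint32_t MaskedOr = (X1 & C1) | (X2 & C2) | (X3 & C3);

  // The emitted sequence: the value operand of the last AND found (X3 here)
  // becomes the base, and every remaining operand adds one nested V_BFI.
  const uint32_t Nested = bfi(C1, X1, bfi(C2, X2, X3));

  assert(Nested == MaskedOr);
  return 0;
}

The equality holds for arbitrary inputs because the base operand's own mask is exactly the complement of the outer masks, which is why the matcher can drop the innermost AND and use its value operand directly.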