diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -262,6 +262,7 @@
   void SelectFMUL_W_CHAIN(SDNode *N);
   SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset,
                    uint32_t Width);
+  bool SelectV_BFI(SDNode *N);
   void SelectS_BFEFromShifts(SDNode *N);
   void SelectS_BFE(SDNode *N);
   bool isCBranchSCC(const SDNode *N) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -674,6 +674,11 @@
     SelectS_BFE(N);
     return;
+  case ISD::OR:
+    if (SelectV_BFI(N))
+      return;
+
+    break;
   case ISD::BRCOND:
     SelectBRCOND(N);
     return;
@@ -2146,6 +2151,116 @@
   SelectCode(N);
 }
 
+// Try matching a 32-bit expression
+// (X_1 & C_1) | (X_2 & C_2) | ... | (X_n & C_n)
+// with constants C_i. If the match succeeds, convert the expression into
+// a sequence of nested V_BFI instructions.
+// Creating the canonical bitfieldInsert form in the frontend leads to
+// simplification of the involved bitmasks through SimplifyDemandedBits,
+// where one bitfieldInsert is the base for another bitfieldInsert.
+// Originally, the intended expression looks like:
+// (X1 & C1) | (~C1 & ((X2 & C2) | (X3 & ~C3)))
+// However, after SimplifyDemandedBits has run during ISel, the expression
+// will look like:
+// (X1 & C1) | ((X2 & C2) | (X3 & (~C1 | ~C3)))
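+//
+// For illustration, using the mask values from the accompanying tests in
+// bfi_nested.ll: with C1 = 0x3ff00000, C2 = 0x000ffc00 and
+// ~C1 & ~C2 = 0xc00003ff left for X3, the masks are pairwise disjoint
+// and together cover all 32 bits, so the expression selects to
+//   v_bfi_b32 v1, 0x000ffc00, X2, X3 ; (X2 & C2) | (X3 & ~C2)
+//   v_bfi_b32 v0, 0x3ff00000, X1, v1 ; (X1 & C1) | (v1 & ~C1)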
+bool AMDGPUDAGToDAGISel::SelectV_BFI(SDNode *N) {
+  struct BFIOperandData {
+    SDNode *X;
+    uint32_t CVal;
+  };
+
+  SmallVector<BFIOperandData> BFIOps;
+  uint32_t Disjunction = 0;
+
+  // Check if this is an AND with a constant operand. If the mask from
+  // the constant operand fulfills the required conditions, update the
+  // total mask and save the AND node as a candidate.
+  auto CreateBFIOpFromAnd = [&](SDNode *And) -> bool {
+    if (And->getOpcode() != ISD::AND)
+      return false;
+
+    ConstantSDNode *Base = dyn_cast<ConstantSDNode>(And->getOperand(0));
+    if (!Base)
+      Base = dyn_cast<ConstantSDNode>(And->getOperand(1));
+    if (!Base)
+      return false;
+
+    const uint32_t Mask = Base->getZExtValue();
+
+    // The masks need to be disjoint.
+    if ((Disjunction & Mask) != 0)
+      return false;
+
+    Disjunction |= Mask;
+    BFIOps.push_back({And, Mask});
+    return true;
+  };
+
+  // We traverse the OR operands of OR (OR, AND) instructions here until
+  // an OR (AND, AND) instruction is reached. During that process, the
+  // constant masks from the AND instructions are gathered.
+  SDNode *CurrentOr = N;
+  while (true) {
+    SDValue O0 = CurrentOr->getOperand(0);
+    SDValue O1 = CurrentOr->getOperand(1);
+    // All investigated OR and AND instructions except the outer one will
+    // be optimized away on success, so they must not have more than one
+    // use (which holds in the BFI pattern).
+    if (!O0.hasOneUse() || !O1.hasOneUse())
+      return false;
+
+    // One of the operands is required to be an OR instruction.
+    SDNode *NewOr = nullptr;
+    SDNode *Other = nullptr;
+    if (O0.getOpcode() == ISD::OR) {
+      NewOr = O0.getNode();
+      Other = O1.getNode();
+    } else if (O1.getOpcode() == ISD::OR) {
+      NewOr = O1.getNode();
+      Other = O0.getNode();
+    }
+
+    // Try inserting the AND operand as a candidate, then traverse down
+    // further.
+    if (NewOr && Other && CreateBFIOpFromAnd(Other)) {
+      CurrentOr = NewOr;
+      continue;
+    }
+
+    // If we visited at least one level of instructions, check if we
+    // reached an OR with two AND operands. Try inserting both operands;
+    // if that succeeds, we reached the (X_i & C_i) | (X_j & C_j) part
+    // and can abort the loop.
+    if (CurrentOr != N && CreateBFIOpFromAnd(O0.getNode()) &&
+        CreateBFIOpFromAnd(O1.getNode())) {
+      break;
+    }
+
+    return false;
+  }
+
+  // Require that at least two nesting levels have been visited and that
+  // the gathered constants form a partition of -1, i.e. the masks are
+  // pairwise disjoint and together cover all 32 bits.
+  if (CurrentOr == N || Disjunction != 0xFFFFFFFF)
+    return false;
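+
+  // V_BFI_B32 computes (S0 & S1) | (~S0 & S2), i.e. it inserts the bits
+  // of S1 selected by the mask S0 into the base value S2.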
+  // Transform the operands node by node and replace the original node.
+  // The value operand of the last AND gathered serves as the base of the
+  // innermost BFI.
+  SDNode *NewNode = BFIOps.back().X->getOperand(0).getNode();
+
+  assert(NewNode && "Cannot use non-existing node as base!");
+  for (auto &BFITuple : reverse(makeArrayRef(BFIOps).drop_back())) {
+    NewNode = CurDAG->getMachineNode(
+        AMDGPU::V_BFI_B32_e64, SDLoc(N), N->getValueType(0),
+        {BFITuple.X->getOperand(1), BFITuple.X->getOperand(0),
+         SDValue(NewNode, 0)});
+  }
+
+  ReplaceNode(N, NewNode);
+
+  return true;
+}
+
 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
   switch (N->getOpcode()) {
   case ISD::AND:
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -15,17 +15,15 @@
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x3ff00000, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -44,6 +42,38 @@
   ret float %result
 }
 
+define float @v_bfi_single_nesting_level_swapped_operands(float %x, float %y, float %z) {
+; GFX10-LABEL: v_bfi_single_nesting_level_swapped_operands:
+; GFX10:       ; %bb.0: ; %.entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x3ff00000, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+.entry:
+  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
+  %mul.base.i32 = fptoui float %mul.base to i32
+  %y.i32 = fptoui float %y to i32
+  %shl.inner.insert = shl i32 %y.i32, 10
+  %bfi1.and = and i32 1047552, %shl.inner.insert
+  %bfi1.andnot = and i32 -1073740801, %mul.base.i32
+  %bfi1.or = or i32 %bfi1.and, %bfi1.andnot
+  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
+  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
+  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
+  %and.outer = and i32 %shl.outer.insert, 1072693248
+  %or.outer = or i32 %and.outer, %bfi1.or
+  %result = bitcast i32 %or.outer to float
+  ret float %result
+}
+
 define float @v_bfi_single_nesting_level_inner_use(float %x, float %y, float %z) {
 ; GFX10-LABEL: v_bfi_single_nesting_level_inner_use:
 ; GFX10:       ; %bb.0: ; %.entry
@@ -114,18 +144,16 @@
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_bfi_b32 v2, 0x3e0, v3, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-NEXT:    v_and_or_b32 v2, 0x3e0, v3, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX10-NEXT:    v_or3_b32 v0, v2, v1, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x3ff00000, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %y.i32 = fptoui float %y to i32