diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2874,10 +2874,17 @@
   return false;
 }
 
+// Given an 'ISD::OR' node that is going to be selected as BFM, analyze
+// the operands and select it to AArch64::ORR with a shifted register if
+// that is more efficient. Returns true iff selection to AArch64::ORR happens.
 static bool tryOrrWithShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
                             SDValue Src, SDValue Dst, SelectionDAG *CurDAG,
                             const bool BiggerPattern) {
   EVT VT = N->getValueType(0);
+  assert(N->getOpcode() == ISD::OR && "Expect N to be an OR node");
+  assert(((N->getOperand(0) == OrOpd0 && N->getOperand(1) == OrOpd1) ||
+          (N->getOperand(1) == OrOpd0 && N->getOperand(0) == OrOpd1)) &&
+         "Expect OrOpd0 and OrOpd1 to be operands of ISD::OR");
   assert((VT == MVT::i32 || VT == MVT::i64) &&
          "Expect result type to be i32 or i64 since N is combinable to BFM");
   SDLoc DL(N);
@@ -2886,6 +2893,7 @@
   if (OrOpd1 != Dst)
     return false;
 
+  const unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
   // For "BFM Rd, Rn, #immr, #imms", it's known that BFM simplifies away fewer
   // nodes from Rn (or inserts additional shift node) if BiggerPattern is true.
   if (BiggerPattern) {
@@ -2902,7 +2910,6 @@
     uint64_t ShiftAmount;
     if (isWorthFoldingIntoOrrWithShift(Dst, CurDAG, ShiftedOperand,
                                        ShiftAmount)) {
-      unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
       SDValue Ops[] = {OrOpd0, ShiftedOperand,
                        CurDAG->getTargetConstant(ShiftAmount, DL, VT)};
       CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
@@ -2914,13 +2921,52 @@
 
   assert((!BiggerPattern) && "BiggerPattern should be handled above");
 
   uint64_t ShlImm;
-  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm) &&
-      OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
-    unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
-    SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ShlImm, DL, VT)};
-    CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
-    return true;
+  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm)) {
+    if (OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
+      SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ShlImm, DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
+
+    // Select the pattern below to a left-shifted operand rather than BFI.
+    // %val1 = op ..
+    // %val2 = shl %val1, #imm
+    // %res = or %val1, %val2
+    //
+    // If N were selected as BFI, we know that
+    // 1) OrOpd0 would be the operand to extract bits from (folded into BFI)
+    // 2) OrOpd1 would be the destination operand (preserved)
+    //
+    // Instead of selecting N as BFI, fold OrOpd0 as a left shift directly.
+    if (OrOpd0.getOperand(0) == OrOpd1) {
+      SDValue Ops[] = {OrOpd1, OrOpd1,
+                       CurDAG->getTargetConstant(ShlImm, DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
+  }
+
+  uint64_t SrlImm;
+  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SRL, SrlImm)) {
+    // Select the pattern below to a right-shifted operand rather than BFXIL.
+    // %val1 = op ..
+    // %val2 = lshr %val1, #imm
+    // %res = or %val1, %val2
+    //
+    // If N were selected as BFXIL, we know that
+    // 1) OrOpd0 would be the operand to extract bits from (folded into BFXIL)
+    // 2) OrOpd1 would be the destination operand (preserved)
+    //
+    // Instead of selecting N as BFXIL, fold OrOpd0 as a right shift directly.
+    if (OrOpd0.getOperand(0) == OrOpd1) {
+      SDValue Ops[] = {
+          OrOpd1, OrOpd1,
+          CurDAG->getTargetConstant(
+              AArch64_AM::getShifterImm(AArch64_AM::LSR, SrlImm), DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
   }
 
   return false;
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -643,8 +643,7 @@
 ; CHECK-LABEL: test_orr_not_bfi_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and x8, x0, #0xff
-; CHECK-NEXT:    bfi x8, x0, #8, #8
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    orr x0, x8, x8, lsl #8
 ; CHECK-NEXT:    ret
   %2 = and i64 %0, 255
   %3 = shl i64 %2, 8
@@ -674,8 +673,7 @@
 ; CHECK-LABEL: test_orr_not_bfi_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    bfi w8, w0, #8, #8
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w8, lsl #8
 ; CHECK-NEXT:    ret
   %2 = and i32 %0, 255
   %3 = shl i32 %2, 8
@@ -704,8 +702,7 @@
 ; CHECK-LABEL: orr_not_bfxil_test2_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and x8, x0, #0xff000
-; CHECK-NEXT:    bfxil x8, x0, #12, #8
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    orr x0, x8, x8, lsr #12
 ; CHECK-NEXT:    ret
   %2 = and i64 %0, 1044480 ; 0xff000
   %3 = lshr i64 %2, 12
@@ -734,8 +731,7 @@
 ; CHECK-LABEL: orr_not_bfxil_test2_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xff000
-; CHECK-NEXT:    bfxil w8, w0, #12, #8
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w8, lsr #12
 ; CHECK-NEXT:    ret
   %2 = and i32 %0, 1044480 ; 0xff000
   %3 = lshr i32 %2, 12
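Note on the two ORR immediates above: the SHL path passes ShlImm to
getTargetConstant directly, while the SRL path wraps SrlImm in
AArch64_AM::getShifterImm(AArch64_AM::LSR, ...). Both are correct because the
shifted-register forms (ORRWrs/ORRXrs) carry the shift kind and amount packed
into one immediate by AArch64_AM::getShifterImm (defined in
llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h), and LSL encodes
as type 0, so for left shifts the packed value equals the raw shift amount. A
minimal standalone sketch of that packing, using a simplified stand-in enum for
AArch64_AM::ShiftExtendType (not part of this patch):

  #include <cassert>

  // Simplified stand-in for AArch64_AM::ShiftExtendType; the enumerator
  // order matches the encodings used by the shifted-register operand.
  enum ShiftType { LSL = 0, LSR, ASR, ROR };

  // Pack a shift type and a 6-bit shift amount into a single immediate:
  // the type goes in the bits above bit 5, the amount in bits [5:0].
  // Because LSL encodes as 0, getShifterImm(LSL, Imm) == Imm, which is why
  // the SHL case above can pass ShlImm unpacked.
  unsigned getShifterImm(ShiftType ST, unsigned Imm) {
    assert(Imm < 64 && "shift amount must fit in 6 bits");
    return (static_cast<unsigned>(ST) << 6) | Imm;
  }

For example, getShifterImm(LSR, 12) yields (1 << 6) | 12, corresponding to the
"lsr #12" operands checked in orr_not_bfxil_test2_i64/_i32 above.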