diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2875,10 +2875,17 @@
   return false;
 }
 
+// Given an 'ISD::OR' node that would otherwise be selected as BFM, analyze
+// its operands and select the node as AArch64::ORR with a shifted register
+// operand instead if that is more efficient. Returns true iff ORR is selected.
 static bool tryOrrWithShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
                             SDValue Src, SDValue Dst, SelectionDAG *CurDAG,
                             const bool BiggerPattern) {
   EVT VT = N->getValueType(0);
+  assert(N->getOpcode() == ISD::OR && "Expect N to be an OR node");
+  assert(((N->getOperand(0) == OrOpd0 && N->getOperand(1) == OrOpd1) ||
+          (N->getOperand(1) == OrOpd0 && N->getOperand(0) == OrOpd1)) &&
+         "Expect OrOpd0 and OrOpd1 to be operands of ISD::OR");
   assert((VT == MVT::i32 || VT == MVT::i64) &&
          "Expect result type to be i32 or i64 since N is combinable to BFM");
   SDLoc DL(N);
@@ -2887,6 +2894,7 @@
   if (OrOpd1 != Dst)
     return false;
 
+  const unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
   // For "BFM Rd, Rn, #immr, #imms", it's known that BFM simplifies away fewer
   // nodes from Rn (or inserts additional shift node) if BiggerPattern is true.
   if (BiggerPattern) {
@@ -2903,7 +2911,6 @@
     uint64_t EncodedShiftImm;
     if (isWorthFoldingIntoOrrWithShift(Dst, CurDAG, ShiftedOperand,
                                        EncodedShiftImm)) {
-      unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
       SDValue Ops[] = {OrOpd0, ShiftedOperand,
                        CurDAG->getTargetConstant(EncodedShiftImm, DL, VT)};
       CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
@@ -2915,16 +2922,58 @@
 
   assert((!BiggerPattern) && "BiggerPattern should be handled above");
 
   uint64_t ShlImm;
-  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm) &&
-      OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
-    unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
-    SDValue Ops[] = {
-        Dst, Src,
-        CurDAG->getTargetConstant(
-            AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
-    CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
-    return true;
+  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm)) {
+    if (OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
+      SDValue Ops[] = {
+          Dst, Src,
+          CurDAG->getTargetConstant(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
+
+    // Select the following pattern as an ORR with a left-shifted operand
+    // rather than BFI:
+    //   %val1 = op ..
+    //   %val2 = shl %val1, #imm
+    //   %res = or %val1, %val2
+    //
+    // If N were selected as BFI, OrOpd0 would be the operand from which bits
+    // are extracted (i.e., folded into BFI) and OrOpd1 would be the
+    // destination operand (i.e., preserved). Instead, fold OrOpd0 into a
+    // left-shifted register operand of ORR directly.
+    if (OrOpd0.getOperand(0) == OrOpd1) {
+      SDValue Ops[] = {
+          OrOpd1, OrOpd1,
+          CurDAG->getTargetConstant(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
+  }
+
+  uint64_t SrlImm;
+  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SRL, SrlImm)) {
+    // Select the following pattern as an ORR with a right-shifted operand
+    // rather than BFXIL:
+    //   %val1 = op ..
+    //   %val2 = lshr %val1, #imm
+    //   %res = or %val1, %val2
+    //
+    // If N were selected as BFXIL, OrOpd0 would be the operand from which bits
+    // are extracted (i.e., folded into BFXIL) and OrOpd1 would be the
+    // destination operand (i.e., preserved). Instead, fold OrOpd0 into a
+    // right-shifted register operand of ORR directly.
+    if (OrOpd0.getOperand(0) == OrOpd1) {
+      SDValue Ops[] = {
+          OrOpd1, OrOpd1,
+          CurDAG->getTargetConstant(
+              AArch64_AM::getShifterImm(AArch64_AM::LSR, SrlImm), DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
+  }
 
   return false;
 }
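For illustration only (not part of the patch): a minimal C++ source-level sketch of the SHL case above, mirroring @test_orr_not_bfi_i64 from the test diff below. The function name is invented, and the annotated assembly assumes the patched selector.

  #include <cstdint>

  // Hypothetical example: a value OR'ed with a left-shifted copy of itself.
  // With this patch the 'or' selects to a single ORR with an LSL'ed register
  // operand instead of BFI plus a register move.
  uint64_t duplicate_low_byte(uint64_t x) {
    uint64_t v = x & 0xff; // %val1 = and i64 %0, 255
    return v | (v << 8);   // %res = or (shl %val1, 8), %val1
                           //   => orr x0, x8, x8, lsl #8
  }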
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -638,13 +638,12 @@
 }
 
 ; For or operation, one operand is a left shift of another operand.
-; Use orr with left-shifted operand is better than bfi.
+; So orr with a left-shifted operand is generated (not bfi).
 define i64 @test_orr_not_bfi_i64(i64 %0) {
 ; CHECK-LABEL: test_orr_not_bfi_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and x8, x0, #0xff
-; CHECK-NEXT:    bfi x8, x0, #8, #8
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    orr x0, x8, x8, lsl #8
 ; CHECK-NEXT:    ret
   %2 = and i64 %0, 255
   %3 = shl i64 %2, 8
@@ -668,14 +667,13 @@
   ret i32 %or_res
 }
 
-; orr is better than bfi, since both simplify away one instruction (%3)
+; orr is generated (not bfi): either choice simplifies away one instruction (%3),
 ; while orr has shorter latency and higher throughput.
 define i32 @test_orr_not_bfi_i32(i32 %0) {
 ; CHECK-LABEL: test_orr_not_bfi_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    bfi w8, w0, #8, #8
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w8, lsl #8
 ; CHECK-NEXT:    ret
   %2 = and i32 %0, 255
   %3 = shl i32 %2, 8
@@ -698,14 +696,13 @@
   ret i64 %or_res
 }
 
-; orr is better than bfxil, since one operand is the right shift of another
+; orr is generated (not bfxil), since one operand is the right shift of another
 ; operand.
 define i64 @orr_not_bfxil_test2_i64(i64 %0) {
 ; CHECK-LABEL: orr_not_bfxil_test2_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and x8, x0, #0xff000
-; CHECK-NEXT:    bfxil x8, x0, #12, #8
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    orr x0, x8, x8, lsr #12
 ; CHECK-NEXT:    ret
   %2 = and i64 %0, 1044480 ; 0xff000
   %3 = lshr i64 %2, 12
@@ -729,13 +726,12 @@
   ret i32 %or_res
 }
 
-; one operand is the shift of another operand, so orr is better.
+; one operand is the shift of another operand, so orr is generated (not bfxil).
 define i32 @orr_not_bfxil_test2_i32(i32 %0) {
 ; CHECK-LABEL: orr_not_bfxil_test2_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xff000
-; CHECK-NEXT:    bfxil w8, w0, #12, #8
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w8, lsr #12
 ; CHECK-NEXT:    ret
   %2 = and i32 %0, 1044480 ; 0xff000
   %3 = lshr i32 %2, 12
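Similarly, for illustration only: a hypothetical source-level counterpart of the SRL case (function name invented; the expected assembly mirrors @orr_not_bfxil_test2_i64 above and assumes the patched selector).

  #include <cstdint>

  // Hypothetical example: a value OR'ed with a right-shifted copy of itself.
  // With this patch the 'or' selects to ORR with an LSR'ed register operand
  // instead of BFXIL plus a register move.
  uint64_t fold_right_shift(uint64_t x) {
    uint64_t v = x & 0xff000; // %val1 = and i64 %0, 1044480
    return v | (v >> 12);     // %res = or (lshr %val1, 12), %val1
                              //   => orr x0, x8, x8, lsr #12
  }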