Index: lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1990,6 +1990,7 @@
   unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
   unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
 
+  // Optimization #1:
   // Given a OR operation, check if we have the following pattern
   // ubfm c, b, imm, imm2 (or something that does the same jobs, see
   //                       isBitfieldExtractOp)
@@ -2084,7 +2085,87 @@
     CurDAG->SelectNodeTo(N, Opc, VT, Ops);
     return true;
   }
-  return false;
+
+  // Optimization #2:
+  // Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
+  // inserted only sets known zero bits.
+  uint64_t OrImm;
+  if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
+    return false;
+
+  // Skip this transformation if the ORR immediate can be encoded in the ORR.
+  // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely
+  // performance neutral.
+  if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
+    return false;
+
+  uint64_t MaskImm;
+  SDValue And = N->getOperand(0);
+  // Must be a single use AND with an immediate operand.
+  if (!And.hasOneUse() ||
+      !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
+    return false;
+
+  // Compute the Known Zero for the AND as this allows us to catch more general
+  // cases than just looking for AND with imm.
+  APInt KnownZero, KnownOne;
+  CurDAG->computeKnownBits(And, KnownZero, KnownOne);
+
+  // Non-zero in the sense that they're not provably zero, which is the key
+  // point if we want to use this value.
+  uint64_t NonZeroBits = (~KnownZero).getZExtValue();
+
+  // The mask can be 0's on either or both "outsides", all the "inside" bits
+  // must be 1's.
+  if ((VT == MVT::i32 && !isShiftedMask_32(~NonZeroBits)) ||
+      (VT == MVT::i64 && !isShiftedMask_64(~NonZeroBits)))
+    return false;
+
+  // The bits being inserted must only set those bits that are known to be zero.
+  if ((OrImm & NonZeroBits) != 0)
+    return false;
+
+  // BFI/BFXIL dst, src, #lsb, #width.
+  int LSB = countTrailingOnes(NonZeroBits);
+  int Width = BitWidth - APInt(BitWidth, NonZeroBits).countPopulation();
+
+  // BFI/BFXIL is an alias of BFM, so translate to BFM operands.
+  unsigned ImmR = (BitWidth - LSB) % BitWidth;
+  unsigned ImmS = Width - 1;
+
+  // If we're creating a BFI instruction avoid cases where we need more
+  // instructions to materialize the BFI constant as compared to the original
+  // ORR.  A BFXIL will use the same constant as the original ORR, so the code
+  // should be no worse in this case.
+  bool IsBFI = LSB != 0;
+  uint64_t BFIImm = OrImm >> LSB;
+  if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
+    // We have a BFI instruction and we know the constant can't be materialized
+    // with a ORR-immediate with the zero register.
+    unsigned OrChunks = 0, BFIChunks = 0;
+    for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
+      if (((OrImm >> Shift) & 0xFFFF) != 0)
+        ++OrChunks;
+      if (((BFIImm >> Shift) & 0xFFFF) != 0)
+        ++BFIChunks;
+    }
+    if (BFIChunks > OrChunks)
+      return false;
+  }
+
+  // Materialize the constant to be inserted.
+  SDLoc DL(N);
+  unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
+  SDNode *MOVI = CurDAG->getMachineNode(
+      MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
+
+  // Create the BFI/BFXIL instruction.
+  SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
+                   CurDAG->getTargetConstant(ImmR, DL, VT),
+                   CurDAG->getTargetConstant(ImmS, DL, VT)};
+  unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+  return true;
 }
 
 bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
Index: test/CodeGen/AArch64/bfi.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/bfi.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+; CHECK-LABEL: @test1
+; CHECK: movz w8, #5
+; CHECK: bfxil w0, w8, #0, #4
+define i32 @test1(i32 %a) {
+  %1 = and i32 %a, -16 ; 0xfffffff0
+  %2 = or i32 %1, 5    ; 0x00000005
+  ret i32 %2
+}
+
+; CHECK-LABEL: @test2
+; CHECK: movz w8, #10
+; CHECK: bfi w0, w8, #22, #4
+define i32 @test2(i32 %a) {
+  %1 = and i32 %a, -62914561 ; 0xfc3fffff
+  %2 = or i32 %1, 41943040   ; 0x06400000
+  ret i32 %2
+}
+
+; CHECK-LABEL: @test3
+; CHECK: movz x8, #5
+; CHECK: bfxil x0, x8, #0, #3
+define i64 @test3(i64 %a) {
+  %1 = and i64 %a, -8 ; 0xfffffffffffffff8
+  %2 = or i64 %1, 5   ; 0x0000000000000005
+  ret i64 %2
+}
+
+; CHECK-LABEL: @test4
+; CHECK: movz x8, #9
+; CHECK: bfi x0, x8, #1, #7
+define i64 @test4(i64 %a) {
+  %1 = and i64 %a, -255 ; 0xffffffffffffff01
+  %2 = or i64 %1,  18   ; 0x0000000000000012
+  ret i64 %2
+}
+
+; Don't generate BFI/BFXIL if the immediate can be encoded in the ORR.
+; CHECK-LABEL: @test5
+; CHECK: and w8, w0, #0xfffffff0
+; CHECK: orr w0, w8, #0x6
+define i32 @test5(i32 %a) {
+  %1 = and i32 %a, 4294967280 ; 0xfffffff0
+  %2 = or i32 %1, 6           ; 0x00000006
+  ret i32 %2
+}
+
+; BFXIL will use the same constant as the ORR, so we don't care how the constant
+; is materialized (it's an equal cost either way).
+; CHECK-LABEL: @test6
+; CHECK: movz w8, #11, lsl #16
+; CHECK: movk w8, #23250
+; CHECK: bfxil w0, w8, #0, #20
+define i32 @test6(i32 %a) {
+  %1 = and i32 %a, 4293918720 ; 0xfff00000
+  %2 = or i32 %1, 744146      ; 0x000b5ad2
+  ret i32 %2
+}
+
+; BFIs that require the same number of instruction to materialize the constant
+; as the original ORR are okay.
+; CHECK-LABEL: @test7
+define i32 @test7(i32 %a) {
+  %1 = and i32 %a, 4293918721 ; 0xfff00001
+  %2 = or i32 %1, 744146      ; 0x000b5ad2
+  ret i32 %2
+}
+
+; BFIs that requires more instructions to materialize the constant as compared
+; to the original ORR are not okay.  In this case we would be replacing the
+; 'and' with a 'movk', which would decrease ILP while using the same number of
+; instructions.
+; CHECK: @test8
+; CHECK: movz x9, #36694, lsl #32
+; CHECK: and x8, x0, #0xff000000000000ff
+; CHECK: movk x9, #31059, lsl #16
+; CHECK: orr x0, x8, x9
+define i64 @test8(i64 %a) {
+  %1 = and i64 %a, -72057594037927681 ; 0xff000000000000ff
+  %2 = or i64 %1, 157601565442048     ; 0x00008f5679530000
+  ret i64 %2
+}