Index: lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
===================================================================
--- lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -1178,7 +1178,7 @@
   case PPC::RLWINMbm:
   case PPC::RLWINMobm: {
     unsigned MB, ME;
-    int64_t BM = Inst.getOperand(3).getImm();
+    unsigned BM = Inst.getOperand(3).getImm();
     if (!isRunOfOnes(BM, MB, ME))
       break;
 
@@ -1195,7 +1195,7 @@
   case PPC::RLWIMIbm:
   case PPC::RLWIMIobm: {
     unsigned MB, ME;
-    int64_t BM = Inst.getOperand(3).getImm();
+    unsigned BM = Inst.getOperand(3).getImm();
     if (!isRunOfOnes(BM, MB, ME))
       break;
 
@@ -1213,7 +1213,7 @@
   case PPC::RLWNMbm:
   case PPC::RLWNMobm: {
     unsigned MB, ME;
-    int64_t BM = Inst.getOperand(3).getImm();
+    unsigned BM = Inst.getOperand(3).getImm();
     if (!isRunOfOnes(BM, MB, ME))
       break;
 
Index: lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
===================================================================
--- lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -58,11 +58,23 @@
 /// 0s on either side. The 1s are allowed to wrap from LSB to MSB, so
 /// 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is not,
 /// since all 1s are not contiguous.
-static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
+/// So far, isRunOfOnes supports only 32-bit and 64-bit unsigned integer types.
+template <typename T>
+static inline bool isRunOfOnes(T Val, unsigned &MB, unsigned &ME) {
+  static_assert(std::numeric_limits<T>::is_integer &&
+                    !std::numeric_limits<T>::is_signed &&
+                    (std::numeric_limits<T>::digits == 32 ||
+                     std::numeric_limits<T>::digits == 64),
+                "isRunOfOnes supports only 32-bit and 64-bit unsigned integer");
+
   if (!Val)
     return false;
 
-  if (isShiftedMask_32(Val)) {
+  const bool Is64Bit = (std::numeric_limits<T>::digits == 64);
+
+  bool IsShiftedMask = Is64Bit ? isShiftedMask_64(Val) :
+                                 isShiftedMask_32(Val);
+  if (IsShiftedMask) {
     // look for the first non-zero bit
     MB = countLeadingZeros(Val);
     // look for the first zero bit after the run of ones
@@ -70,7 +82,9 @@
     return true;
   } else {
     Val = ~Val; // invert mask
-    if (isShiftedMask_32(Val)) {
+    IsShiftedMask = Is64Bit ? isShiftedMask_64(Val) :
+                              isShiftedMask_32(Val);
+    if (IsShiftedMask) {
       // effectively look for the first zero bit
       ME = countLeadingZeros(Val) - 1;
       // effectively look for the first one bit after the run of zeros
Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -169,6 +169,11 @@
     bool tryBitfieldInsert(SDNode *N);
     bool tryBitPermutation(SDNode *N);
 
+    /// tryRotateThenMaskInsert - Returns true if N is replaced by
+    /// RLDIMI/RLWIMI instruction.
+    bool tryRotateThenMaskInsert32(SDNode *N);
+    bool tryRotateThenMaskInsert64(SDNode *N);
+
     /// SelectCC - Select a comparison of the specified values with the
     /// specified condition code, returning the CR# of the expression.
     SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
@@ -464,6 +469,12 @@
   return isInt32Immediate(N.getNode(), Imm);
 }
 
+/// isInt64Immediate - This method tests to see if the value is a 64-bit
+/// constant operand. If so Imm will receive the 64-bit value.
+static bool isInt64Immediate(SDValue N, uint64_t &Imm) {
+  return isInt64Immediate(N.getNode(), Imm);
+}
+
 static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo,
                               const SDValue &DestMBB) {
   assert(isa<BasicBlockSDNode>(DestMBB));
@@ -577,9 +588,41 @@
   return false;
 }
 
+/// Find a subtree generated for bitfield insert and convert it with
+/// a rotate left then mask insert instruction.
+bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
+  // Expected nodes
+  //   %and1 = and i64 %val1, MASK
+  //   %and2 = and i64 %val2, ~MASK
+  //   %N = or i64 %and1, %and2
+  if (N->getOpcode() != ISD::OR)
+    return false;
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0->getOpcode() != ISD::AND || Op1->getOpcode() != ISD::AND)
+    return false;
+
+  if (N->getValueType(0) == MVT::i32) {
+    unsigned Mask1, Mask2;
+    if (isInt32Immediate(Op0->getOperand(1), Mask1) &&
+        isInt32Immediate(Op1->getOperand(1), Mask2) &&
+        Mask1 == ~Mask2)
+      return tryRotateThenMaskInsert32(N);
+  }
+  if (N->getValueType(0) == MVT::i64) {
+    uint64_t Mask1, Mask2;
+    if (isInt64Immediate(Op0->getOperand(1), Mask1) &&
+        isInt64Immediate(Op1->getOperand(1), Mask2) &&
+        Mask1 == ~Mask2)
+      return tryRotateThenMaskInsert64(N);
+  }
+  return false;
+}
+
 /// Turn an or of two masked values into the rotate left word immediate then
 /// mask insert (rlwimi) instruction.
-bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
+bool PPCDAGToDAGISel::tryRotateThenMaskInsert32(SDNode *N) {
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   SDLoc dl(N);
@@ -594,7 +637,7 @@
   if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
     unsigned Op0Opc = Op0.getOpcode();
     unsigned Op1Opc = Op1.getOpcode();
-    unsigned Value, SH = 0;
+    unsigned Value = 0, SH = 0;
     TargetMask = ~TargetMask;
     InsertMask = ~InsertMask;
 
@@ -621,12 +664,11 @@
 
     unsigned MB, ME;
     if (isRunOfOnes(InsertMask, MB, ME)) {
-      SDValue Tmp1, Tmp2;
-
       if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
           isInt32Immediate(Op1.getOperand(1), Value)) {
         Op1 = Op1.getOperand(0);
         SH  = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
+        Op1Opc = Op1.getOpcode();
       }
       if (Op1Opc == ISD::AND) {
        // The AND mask might not be a constant, and we need to make sure that
@@ -643,10 +685,20 @@
           // otherwise there would not be any bits set in InsertMask.
           Op1 = Op1.getOperand(0).getOperand(0);
           SH  = (SHOpc == ISD::SHL) ? Value : 32 - Value;
+          Op1Opc = Op1.getOpcode();
         }
       }
 
       SH &= 31;
+
+      // We eliminate AND instructions if they are already folded into rlwimi.
+      if (Op1Opc == ISD::AND && isInt32Immediate(Op1.getOperand(1), Value) &&
+          (Value << SH) == InsertMask)
+        Op1 = Op1.getOperand(0);
+      if (Op0Opc == ISD::AND && isInt32Immediate(Op0.getOperand(1), Value) &&
+          Value == ~InsertMask)
+        Op0 = Op0.getOperand(0);
+
       SDValue Ops[] = { Op0, Op1, getI32Imm(SH, dl), getI32Imm(MB, dl),
                         getI32Imm(ME, dl) };
       ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
@@ -656,6 +708,101 @@
   return false;
 }
 
+/// Turn an or of two masked values into the rotate left double word immediate
+/// then mask insert (rldimi) instruction.
+bool PPCDAGToDAGISel::tryRotateThenMaskInsert64(SDNode *N) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDLoc dl(N);
+
+  KnownBits LKnown, RKnown;
+  CurDAG->computeKnownBits(Op0, LKnown);
+  CurDAG->computeKnownBits(Op1, RKnown);
+
+  uint64_t TargetMask = LKnown.Zero.getZExtValue();
+  uint64_t InsertMask = RKnown.Zero.getZExtValue();
+
+  // If all bits come from two source registers,
+  // we can use rldimi instruction.
+  if ((TargetMask | InsertMask) == ~0ULL) {
+    uint64_t Op0Opc = Op0.getOpcode();
+    uint64_t Op1Opc = Op1.getOpcode();
+    uint64_t Value64 = 0;
+    unsigned Value = 0, SH = 0;
+    TargetMask = ~TargetMask;
+    InsertMask = ~InsertMask;
+
+    // If the LHS has a foldable shift and the RHS does not, then swap it to the
+    // RHS so that we can fold the shift into the insert.
+    if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
+      if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
+          Op0.getOperand(0).getOpcode() == ISD::SRL) {
+        if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
+            Op1.getOperand(0).getOpcode() != ISD::SRL) {
+          std::swap(Op0, Op1);
+          std::swap(Op0Opc, Op1Opc);
+          std::swap(TargetMask, InsertMask);
+        }
+      }
+    } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
+      if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
+          Op1.getOperand(0).getOpcode() != ISD::SRL) {
+        std::swap(Op0, Op1);
+        std::swap(Op0Opc, Op1Opc);
+        std::swap(TargetMask, InsertMask);
+      }
+    }
+
+    unsigned MB, ME;
+    if (isRunOfOnes(InsertMask, MB, ME)) {
+      // to opt: We will have additional opportunity to use rldimi
+      // by supporting patterns with ZERO_EXTEND in Op0Opc or Op1Opc
+      if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
+          isInt32Immediate(Op1.getOperand(1), Value)) {
+        Op1 = Op1.getOperand(0);
+        SH = (Op1Opc == ISD::SHL) ? Value : 64 - Value;
+        Op1Opc = Op1.getOpcode();
+      }
+      if (Op1Opc == ISD::AND) {
+        // The AND mask might not be a constant, and we need to make sure that
+        // if we're going to fold the masking with the insert, all bits not
+        // know to be zero in the mask are known to be one.
+        KnownBits MKnown;
+        CurDAG->computeKnownBits(Op1.getOperand(1), MKnown);
+        bool CanFoldMask = InsertMask == MKnown.One.getZExtValue();
+
+        unsigned SHOpc = Op1.getOperand(0).getOpcode();
+        if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask &&
+            isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
+          // Note that Value must be in range here (less than 32) because
+          // otherwise there would not be any bits set in InsertMask.
+          Op1 = Op1.getOperand(0).getOperand(0);
+          SH = (SHOpc == ISD::SHL) ? Value : 64 - Value;
+          Op1Opc = Op1.getOpcode();
+        }
+      }
+
+      SH &= 63;
+
+      // We cannot specify ME for rldimi; ~SH is used instead.
+      if (ME == 63 - SH) {
+        // We eliminate AND instructions if they are already folded into rldimi.
+        if (Op1Opc == ISD::AND && isInt64Immediate(Op1.getOperand(1), Value64) &&
+            (Value64 << SH) == InsertMask)
+          Op1 = Op1.getOperand(0);
+        if (Op0Opc == ISD::AND && isInt64Immediate(Op0.getOperand(1), Value64) &&
+            Value64 == ~InsertMask)
+          Op0 = Op0.getOperand(0);
+
+        SDValue Ops[] = { Op0, Op1, getI32Imm(SH, dl), getI32Imm(MB, dl) };
+        ReplaceNode(N, CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops));
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 // Predict the number of instructions that would be generated by calling
 // getInt64(N).
 static unsigned getInt64CountDirect(int64_t Imm) {
@@ -3035,7 +3182,14 @@
       N->getOperand(1).getOpcode() == ISD::TargetConstant)
     llvm_unreachable("Invalid ADD with TargetConstant operand");
 
-  // Try matching complex bit permutations before doing anything else.
+  // Find opportunity to use rotate left immediate then mask insert instruction
+  // for a simple bitfield insert, i.e. (or (and %a, MASK) (and %b, ~MASK))
+  // before tryBitPermutation, which may generate suboptimal machine IR;
+  // Leave more complicated cases for tryBitPermutation.
+  if (tryBitfieldInsert(N))
+    return;
+
+  // Try matching complex bit permutations next.
   if (tryBitPermutation(N))
     return;
 
@@ -3316,9 +3470,9 @@
     break;
   }
   case ISD::OR: {
-    if (N->getValueType(0) == MVT::i32)
-      if (tryBitfieldInsert(N))
-        return;
+    if ((N->getValueType(0) == MVT::i32 && tryRotateThenMaskInsert32(N)) ||
+        (N->getValueType(0) == MVT::i64 && tryRotateThenMaskInsert64(N)))
+      return;
 
     if (tryLogicOpOfCompares(N))
       return;
Index: test/CodeGen/PowerPC/addi-offset-fold.ll
===================================================================
--- test/CodeGen/PowerPC/addi-offset-fold.ll
+++ test/CodeGen/PowerPC/addi-offset-fold.ll
@@ -30,9 +30,8 @@
 ; CHECK: ori 2, 2, 0
 ; CHECK-DAG: lbz [[REG1:[0-9]+]], -16(1)
 ; CHECK-DAG: lwz [[REG2:[0-9]+]], -20(1)
-; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG1]], 32
-; CHECK-DAG: or [[REG4:[0-9]+]], [[REG2]], [[REG3]]
-; CHECK: rldicl 3, [[REG4]], 33, 57
+; CHECK-DAG: rldimi [[REG3:[0-9]+]], [[REG1]], 32, 24
+; CHECK: rldicl 3, [[REG3]], 33, 57
 ; CHECK: blr
 }
 
Index: test/CodeGen/PowerPC/bitfieldinsert.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/bitfieldinsert.ll
@@ -0,0 +1,63 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+
+; bitfieldinsert64: Test for rldimi
+; equivalent C code
+;   struct s64 {
+;     int a:5;
+;     int b:16;
+;     long c:42;
+;   };
+;   void bitfieldinsert64(struct s64 *p, unsigned short v) {
+;     p->b = v;
+;   }
+
+%struct.s64 = type { i64 }
+
+define void @bitfieldinsert64(%struct.s64* nocapture %p, i16 zeroext %v) {
+; CHECK-LABEL: @bitfieldinsert64
+; CHECK: ld [[REG1:[0-9]+]], 0(3)
+; CHECK: rldimi [[REG1]], 4, 5, 43
+; CHECK: std [[REG1]], 0(3)
+; CHECK: blr
+entry:
+  %0 = getelementptr inbounds %struct.s64, %struct.s64* %p, i64 0, i32 0
+  %1 = zext i16 %v to i64
+  %bf.load = load i64, i64* %0, align 8
+  %bf.shl = shl nuw nsw i64 %1, 5
+  %bf.clear = and i64 %bf.load, -2097121
+  %bf.set = or i64 %bf.clear, %bf.shl
+  store i64 %bf.set, i64* %0, align 8
+  ret void
+}
+
+; bitfieldinsert32: Test for rlwimi
+; equivalent C code
+;   struct s32 {
+;     int a:8;
+;     int b:16;
+;     int c:8;
+;   };
+;   void bitfieldinsert32(struct s32 *p, unsigned int v) {
+;     p->b = v;
+;   }
+
+%struct.s32 = type { i32 }
+
+define void @bitfieldinsert32(%struct.s32* nocapture %p, i32 zeroext %v) {
+; CHECK-LABEL: @bitfieldinsert32
+; CHECK: lwz [[REG1:[0-9]+]], 0(3)
+; CHECK: rlwimi [[REG1]], 4, 8, 8, 23
+; CHECK: stw [[REG1]], 0(3)
+; CHECK: blr
+entry:
+  %0 = getelementptr inbounds %struct.s32, %struct.s32* %p, i64 0, i32 0
+  %bf.load = load i32, i32* %0, align 4
+  %bf.value = shl i32 %v, 8
+  %bf.shl = and i32 %bf.value, 16776960
+  %bf.clear = and i32 %bf.load, -16776961
+  %bf.set = or i32 %bf.clear, %bf.shl
+  store i32 %bf.set, i32* %0, align 4
+  ret void
+}
+
Index: test/CodeGen/PowerPC/ppc64le-aggregates.ll
===================================================================
--- test/CodeGen/PowerPC/ppc64le-aggregates.ll
+++ test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -236,14 +236,12 @@
 ; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1)
 ; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1)
 ; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1)
-; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1)
+; CHECK-DAG: lwz 9, [[OFF0]](1)
 ; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1)
-; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1)
+; CHECK-DAG: lwz 10, [[OFF2]](1)
 ; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1)
-; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
-; CHECK-DAG: sldi [[REG3]], [[REG3]], 32
-; CHECK-DAG: or 9, [[REG0]], [[REG1]]
-; CHECK-DAG: or 10, [[REG2]], [[REG3]]
+; CHECK-DAG: rldimi 9, [[REG1]], 32, 0
+; CHECK-DAG: rldimi 10, [[REG3]], 32, 0
 ; CHECK: bl test1
 
 declare void @test1([8 x float], [8 x float])