Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2071,6 +2071,15 @@
                             APInt &KnownZero, APInt &KnownOne,
                             TargetLoweringOpt &TLO, unsigned Depth = 0) const;
 
+  /// Do a target-specific optimization of Op's constant immediate operand using
+  /// the demanded bits information. For example, targets can sign-extend the
+  /// immediate in order to have it folded into the immediate field of the
+  /// instruction. Return true if the immediate node doesn't need further
+  /// optimization and set the New and Old fields of TLO if a new immediate
+  /// node was created.
+  virtual bool optimizeConstant(SDValue Op, const APInt &Demanded,
+                                TargetLoweringOpt &TLO) const;
+
   /// Determine which of the bits specified in Mask are known to be either zero
   /// or one and return them in the KnownZero/KnownOne bitsets.
   virtual void computeKnownBitsForTargetNode(const SDValue Op,
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3424,11 +3424,11 @@
   if (ROR.getNode())
     return ROR;
   // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
-  // iff (c1 & c2) == 0.
+  // iff (c1 & c2) == c1.
   if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
              isa<ConstantSDNode>(N0.getOperand(1))) {
     ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
-    if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) {
+    if ((C1->getAPIntValue() & N1C->getAPIntValue()) == C1->getAPIntValue()) {
       SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1);
       if (!COR.getNode())
         return SDValue();
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -284,6 +284,21 @@
                                                        const APInt &Demanded) {
   SDLoc dl(Op);
 
+  if (Op.getOpcode() == ISD::XOR) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+    // If an XOR already has all the demanded bits set, nothing to change but
+    // don't shrink either.
+    if (!C || (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+      return false;
+  }
+
+  assert(!Old.getNode() && !New.getNode());
+
+  // Return if the constant operand doesn't need further optimization.
+  if (DAG.getTargetLoweringInfo().optimizeConstant(Op, Demanded, *this))
+    return New.getNode();
+
   // FIXME: ISD::SELECT, ISD::SELECT_CC
   switch (Op.getOpcode()) {
   default: break;
@@ -293,10 +308,6 @@
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
     if (!C) return false;
 
-    if (Op.getOpcode() == ISD::XOR &&
-        (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
-      return false;
-
     // if we can expand it to have all bits set, do it
    if (C->getAPIntValue().intersects(~Demanded)) {
      EVT VT = Op.getValueType();
@@ -1082,6 +1093,11 @@
   return false;
 }
 
+bool TargetLowering::optimizeConstant(SDValue Op, const APInt &Demanded,
+                                      TargetLoweringOpt &TLO) const {
+  return false;
+}
+
 /// computeKnownBitsForTargetNode - Determine which of the bits specified
 /// in Mask are known to be either zero or one and return them in the
 /// KnownZero/KnownOne bitsets.
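
For reference, the DAGCombiner change above only adjusts when the canonicalization (or (and X, c1), c2) -> (and (or X, c2), c1|c2) is applied; the rewrite itself preserves the value for any constants, so the condition is purely a profitability check. A small standalone check of the identity (not part of the patch, using arbitrarily chosen sample values, with C1/C2 borrowed from the and2 test further down):

    // Standalone sanity check of the identity behind the DAGCombiner
    // canonicalization above; not part of the patch.
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t C1 = 0x3dfd;      // and-mask from the and2 test below
      const uint32_t C2 = 0xffffc000;  // or-immediate from the and2 test below
      const uint32_t Xs[] = {0u, 0xffffffffu, 0x12345678u, 0x3dfdu};
      for (uint32_t X : Xs) {
        uint32_t LHS = (X & C1) | C2;        // (or (and X, c1), c2)
        uint32_t RHS = (X | C2) & (C1 | C2); // (and (or X, c2), c1|c2)
        assert(LHS == RHS && "canonicalization must preserve the value");
      }
      return 0;
    }
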
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -212,6 +212,9 @@
   /// Selects the correct CCAssignFn for a given CallingConvention value.
   CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
 
+  bool optimizeConstant(SDValue Op, const APInt &Demanded,
+                        TargetLoweringOpt &TLO) const override;
+
   /// computeKnownBitsForTargetNode - Determine which of the bits specified in
   /// Mask are known to be either zero or one and return them in the
   /// KnownZero/KnownOne bitsets.
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -37,6 +37,7 @@
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
 
 namespace {
 enum AlignMode {
@@ -664,6 +665,129 @@
   return VT.changeVectorElementTypeToInteger();
 }
 
+static bool optimizeConstant(unsigned Size, uint64_t Imm, uint64_t &NewImm,
+                             const APInt &Demanded) {
+  uint64_t Enc;
+  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size));
+
+  // Return if the immediate is already a bimm32 or bimm64.
+  if (AArch64_AM::processLogicalImmediate(Imm & Mask, Size, Enc)) {
+    NewImm = Imm;
+    return true;
+  }
+
+  unsigned EltSize = Size;
+  uint64_t DemandedBits = Demanded.getZExtValue();
+
+  // Clear bits that are not demanded.
+  Imm &= DemandedBits;
+
+  while (true) {
+    // The goal here is to set the non-demanded bits in a way that minimizes
+    // the number of transitions between 0 and 1. In order to achieve this
+    // goal, we set the non-demanded bits to the value of the preceding
+    // demanded bits. For example, if we have an immediate 0bx10xx0x1 ('x'
+    // indicates a non-demanded bit), we copy bit0 (1) to the least significant
+    // 'x', bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
+    // The final result is 0b11000011.
+    uint64_t NonDemandedBits = ~DemandedBits;
+    uint64_t Ones =
+        (NonDemandedBits + ((~Imm & DemandedBits) << 1)) & NonDemandedBits;
+
+    // We cannot copy the preceding demanded bit if the rightmost bits are not
+    // demanded. In that case, we replicate the least significant demanded bit
+    // that follows the trailing non-demanded bits.
+    if (Ones & 1) {
+      bool FirstDemandedBitIsZero = ~Imm & ((Ones + 1 | Ones) ^ Ones) & Mask;
+
+      // If the first demanded bit is zero, clear the trailing ones in Ones.
+      if (FirstDemandedBitIsZero)
+        Ones = (Ones + 1) & Ones;
+    }
+
+    uint64_t TmpImm = (Imm | Ones) & Mask;
+
+    // If TmpImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
+    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
+    // we halve the element size and continue the search.
+    if (isShiftedMask_64(TmpImm) || isShiftedMask_64(~(TmpImm | ~Mask))) {
+      Imm = TmpImm;
+      break;
+    }
+
+    // We cannot shrink the element size any further if it is 2 bits.
+    if (EltSize == 2)
+      return false;
+
+    EltSize /= 2;
+    Mask >>= EltSize;
+    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
+
+    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
+    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
+      return false;
+
+    // Merge the upper and lower halves of Imm and DemandedBits.
+    Imm |= Hi;
+    DemandedBits |= DemandedBitsHi;
+  }
+
+  // Replicate the element across the register width.
+  while (EltSize < Size) {
+    Imm |= Imm << EltSize;
+    EltSize *= 2;
+  }
+
+  NewImm = Imm;
+  return true;
+}
+
+bool AArch64TargetLowering::optimizeConstant(SDValue Op, const APInt &Demanded,
+                                             TargetLoweringOpt &TLO) const {
+  // Delay this optimization to as late as possible.
+  if (!TLO.LegalOps)
+    return false;
+
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (!C)
+      break;
+
+    EVT VT = Op.getValueType();
+    if (VT.isVector())
+      break;
+
+    unsigned Size = VT.getSizeInBits();
+    assert((Size == 32 || Size == 64) &&
+           "i32 or i64 is expected after legalization.");
+    uint64_t Imm = C->getZExtValue(), NewImm;
+
+    // Break if we cannot turn Imm into a bitmask immediate.
+    if (!::optimizeConstant(Size, Imm, NewImm, Demanded))
+      break;
+
+    // Return if Imm was already a bitmask immediate.
+    if (Imm == NewImm)
+      return true;
+
+    ++NumOptimizedImms;
+    assert(((NewImm ^ Imm) & Demanded.getZExtValue()) == 0 &&
+           "demanded bits should never be altered");
+
+    // Create the new constant immediate node.
+    SDValue New = TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
+                                  Op.getOperand(0),
+                                  TLO.DAG.getConstant(NewImm, VT));
+    return TLO.CombineTo(Op, New);
+  }
+
+  return false;
+}
+
 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 /// Mask are known to be either zero or one and return them in the
 /// KnownZero/KnownOne bitsets.
Index: test/CodeGen/AArch64/optimize-imm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/optimize-imm.ll
@@ -0,0 +1,64 @@
+; RUN: llc -o - %s -march=aarch64 | FileCheck %s
+
+; CHECK-LABEL: _and1:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffffffd
+
+define void @and1(i32 %a, i8* nocapture %p) {
+entry:
+  %and = and i32 %a, 253
+  %conv = trunc i32 %and to i8
+  store i8 %conv, i8* %p, align 1
+  ret void
+}
+
+; (a & 0x3dfd) | 0xffffc000
+;
+; CHECK-LABEL: _and2:
+; CHECK: and {{w[0-9]+}}, w0, #0xfdfdfdfd
+
+define i32 @and2(i32 %a) {
+entry:
+  %and = and i32 %a, 15869
+  %or = or i32 %and, -16384
+  ret i32 %or
+}
+
+; (a & 0x19) | 0xffffffc0
+;
+; CHECK-LABEL: _and3:
+; CHECK: and {{w[0-9]+}}, w0, #0x99999999
+
+define i32 @and3(i32 %a) {
+entry:
+  %and = and i32 %a, 25
+  %or = or i32 %and, -64
+  ret i32 %or
+}
+
+; (a & 0xc0600) | 0xfff1f1ff
+;
+; CHECK-LABEL: _and4:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffc07ff
+
+define i32 @and4(i32 %a) {
+entry:
+  %and = and i32 %a, 787968
+  %or = or i32 %and, -921089
+  ret i32 %or
+}
+
+; Make sure we don't shrink or optimize an XOR's immediate operand if, on the
+; demanded bits, the immediate is effectively -1. Instruction selection turns
+; (and ((xor $mask, -1), $v0)) into a BIC.
+
+; CHECK-LABEL: _xor1:
+; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38
+; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3
+
+define i32 @xor1(i32 %a) {
+entry:
+  %shl = shl i32 %a, 3
+  %xor = and i32 %shl, 56
+  %and = xor i32 %xor, 56
+  ret i32 %and
+}
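
As a standalone sketch (not part of the patch), the snippet below runs the 0bx10xx0x1 example from the comment in the static AArch64 helper above through the Ones computation, which copies each demanded bit into the run of non-demanded bits directly above it. Bit 0 is demanded in this example, so the trailing-ones correction step is not exercised.

    // Standalone illustration of the non-demanded-bit filling used by the
    // static optimizeConstant helper above; not part of the patch.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t Mask = 0xff;          // pretend the element size is 8 bits
      const uint64_t DemandedBits = 0x65;  // demanded positions of 0bx10xx0x1
      uint64_t Imm = 0x41;                 // Imm restricted to its demanded bits: 0b01000001

      // Propagate each demanded bit upward into the non-demanded bits above it.
      uint64_t NonDemandedBits = ~DemandedBits;
      uint64_t Ones =
          (NonDemandedBits + ((~Imm & DemandedBits) << 1)) & NonDemandedBits;
      assert((Ones & 1) == 0 && "bit 0 is demanded, so no trailing-ones fixup");

      uint64_t Result = (Imm | Ones) & Mask;
      std::printf("0x%llx\n", (unsigned long long)Result); // prints 0xc3, i.e. 0b11000011
      assert(Result == 0xc3 && "matches the 0b11000011 result in the comment");
      return 0;
    }

In the 64-bit helper, NonDemandedBits and Ones can also carry set bits above the element width, which appears to be why TmpImm is masked with Mask before the isShiftedMask_64 checks.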