Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2045,6 +2045,14 @@
                     SDLoc dl, bool doesNotReturn = false,
                     bool isReturnValueUsed = true) const;

+  struct TargetLoweringOpt;
+
+  // Type of call-back function that does target-specific constant optimization.
+  typedef std::function<bool(SDValue, const APInt &, TargetLoweringOpt &)>
+      ConstOptFtorT;
+
+  virtual ConstOptFtorT getConstOptFtor() const { return ConstOptFtorT(); }
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Optimization Methods
   //
@@ -2058,10 +2066,11 @@
     bool LegalOps;
     SDValue Old;
     SDValue New;
+    ConstOptFtorT ConstOptFtor;

-    explicit TargetLoweringOpt(SelectionDAG &InDAG,
-                               bool LT, bool LO) :
-      DAG(InDAG), LegalTys(LT), LegalOps(LO) {}
+    explicit TargetLoweringOpt(SelectionDAG &InDAG, bool LT, bool LO,
+                               ConstOptFtorT Ftor = ConstOptFtorT()) :
+      DAG(InDAG), LegalTys(LT), LegalOps(LO), ConstOptFtor(Ftor) {}

     bool LegalTypes() const { return LegalTys; }
     bool LegalOperations() const { return LegalOps; }
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -870,7 +870,8 @@
 /// Check the specified integer node value to see if it can be simplified or if
 /// things it uses can be simplified by bit propagation. If so, return true.
 bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
-  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations,
+                                        TLI.getConstOptFtor());
   APInt KnownZero, KnownOne;
   if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
     return false;
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -286,6 +286,21 @@
                                           const APInt &Demanded) {
   SDLoc dl(Op);

+  if (Op.getOpcode() == ISD::XOR) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+    // If an XOR already has all the demanded bits set, there is nothing to
+    // change, but don't shrink it either.
+    if (!C || (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+      return false;
+  }
+
+  assert(!Old.getNode() && !New.getNode());
+
+  // Call ConstOptFtor and do target-specific constant optimization.
+  if (ConstOptFtor && ConstOptFtor(Op, Demanded, *this))
+    return New.getNode();
+
   // FIXME: ISD::SELECT, ISD::SELECT_CC
   switch (Op.getOpcode()) {
   default: break;
@@ -295,10 +310,6 @@
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
     if (!C) return false;

-    if (Op.getOpcode() == ISD::XOR &&
-        (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
-      return false;
-
     // if we can expand it to have all bits set, do it
     if (C->getAPIntValue().intersects(~Demanded)) {
       EVT VT = Op.getValueType();
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -233,6 +233,8 @@
                                      APInt &KnownOne, const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;

+  ConstOptFtorT getConstOptFtor() const override;
+
   MVT getScalarShiftAmountTy(EVT LHSTy) const override;

   /// allowsMisalignedMemoryAccesses - Returns true if the target allows
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -38,6 +38,7 @@

 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

 namespace {
 enum AlignMode {
@@ -710,6 +711,143 @@
   return VT.changeVectorElementTypeToInteger();
 }

+static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
+                               const APInt &Demanded,
+                               TargetLowering::TargetLoweringOpt &TLO,
+                               unsigned NewOpc) {
+  uint64_t OldImm = Imm, NewImm, Enc;
+  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size));
+
+  // Return if the immediate is already a bimm32 or bimm64. Reporting it as
+  // handled keeps the generic code from shrinking an already encodable
+  // immediate.
+  if (AArch64_AM::processLogicalImmediate(Imm & Mask, Size, Enc))
+    return true;
+
+  unsigned EltSize = Size;
+  uint64_t DemandedBits = Demanded.getZExtValue();
+
+  // Clear bits that are not demanded.
+  Imm &= DemandedBits;
+
+  while (true) {
+    // The goal here is to set the non-demanded bits in a way that minimizes
+    // the number of switches between 0 and 1. In order to achieve this goal,
+    // we set the non-demanded bits to the value of the preceding demanded bits.
+    // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
+    // non-demanded bit), we copy bit0 (1) to the least significant 'x',
+    // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
+    // The final result is 0b11000011.
+    uint64_t NonDemandedBits = ~DemandedBits;
+    uint64_t InvertedImm = ~Imm & DemandedBits;
+    uint64_t RotatedImm =
+        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
+        NonDemandedBits;
+    uint64_t Sum = RotatedImm + NonDemandedBits;
+
+    // There is a carry if the MSBs of NonDemandedBits and Sum are 1 and 0,
+    // respectively.
+    uint64_t Carry = ((NonDemandedBits & ~Sum) >> (EltSize - 1)) & 1;
+    uint64_t NonDemandedBitPattern = (Sum + Carry) & NonDemandedBits;
+
+    // Compute the new immediate.
+    NewImm = (Imm | NonDemandedBitPattern) & Mask;
+
+    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
+    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
+    // we halve the element size and continue the search.
+    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
+      break;
+
+    // Don't shrink the element size below 2 bits.
+    if (EltSize == 2)
+      return false;
+
+    EltSize /= 2;
+    Mask >>= EltSize;
+    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
+
+    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
+    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
+      return false;
+
+    // Merge the upper and lower halves of Imm and DemandedBits.
+    Imm |= Hi;
+    DemandedBits |= DemandedBitsHi;
+  }
+
+  ++NumOptimizedImms;
+
+  // Replicate the element across the register width.
+  while (EltSize < Size) {
+    NewImm |= NewImm << EltSize;
+    EltSize *= 2;
+  }
+
+  (void)OldImm;
+  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
+         "demanded bits should never be altered");
+
+  // Create the new constant immediate node.
+  EVT VT = Op.getValueType();
+  unsigned Population = countPopulation(NewImm);
+  SDLoc DL(Op);
+
+  // If NewImm is all zeros or all ones, replace the immediate with NewImm and
+  // let the target-independent DAGCombine optimize it further.
+  if (Population == 0 || Population == Size)
+    return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT));
+
+  // Otherwise, emit an AArch64 machine node to prevent the target-independent
+  // DAGCombine from undoing the optimization.
+  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
+  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
+  SDValue New(
+      TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
+
+  return TLO.CombineTo(Op, New);
+}
+
+static bool aarch64ConstOptFunc(SDValue Op, const APInt &Demanded,
+                                TargetLowering::TargetLoweringOpt &TLO) {
+  // Delay this optimization to as late as possible.
+  if (!TLO.LegalOps)
+    return false;
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return false;
+
+  assert(Op.getNumOperands() > 1 && "More than one operand expected.");
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!C)
+    return false;
+
+  unsigned Size = VT.getSizeInBits();
+  assert((Size == 32 || Size == 64) &&
+         "i32 or i64 is expected after legalization.");
+  uint64_t Imm = C->getZExtValue();
+  unsigned NewOpc;
+
+  switch (Op.getOpcode()) {
+  case ISD::AND:
+    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
+    break;
+  case ISD::OR:
+    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
+    break;
+  case ISD::XOR:
+    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
+    break;
+  default:
+    return false;
+  }
+
+  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
+}
+
+TargetLowering::ConstOptFtorT AArch64TargetLowering::getConstOptFtor() const {
+  return aarch64ConstOptFunc;
+}
+
 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 /// Mask are known to be either zero or one and return them in the
 /// KnownZero/KnownOne bitsets.
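The following is a standalone sketch of the non-demanded-bit filling trick used in the loop of optimizeLogicalImm above; it is not part of the patch, and the helper name fillNonDemandedBits, the 8-bit element size, and the driver in main are illustrative assumptions. It reproduces the 0bx10xx0x1 -> 0b11000011 example from the comment.

// Standalone sketch: fill each non-demanded bit with the value of the
// preceding demanded bit, using the same rotate-and-add trick as
// optimizeLogicalImm. Assumes 0 < EltSize < 64 for this demo.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t fillNonDemandedBits(uint64_t Imm, uint64_t DemandedBits,
                                    unsigned EltSize) {
  // Work entirely within an EltSize-bit element.
  uint64_t Mask = (~0ULL) >> (64 - EltSize);
  uint64_t NonDemandedBits = ~DemandedBits & Mask;
  Imm &= DemandedBits & Mask;

  // Adding NonDemandedBits to the rotated, inverted immediate propagates each
  // demanded bit upward into the run of non-demanded bits that follows it.
  uint64_t InvertedImm = ~Imm & DemandedBits & Mask;
  uint64_t RotatedImm =
      ((InvertedImm << 1) | ((InvertedImm >> (EltSize - 1)) & 1)) &
      NonDemandedBits;
  uint64_t Sum = RotatedImm + NonDemandedBits;
  uint64_t Carry = ((NonDemandedBits & ~Sum) >> (EltSize - 1)) & 1;
  uint64_t Pattern = (Sum + Carry) & NonDemandedBits;

  return (Imm | Pattern) & Mask;
}

int main() {
  // Immediate 0bx10xx0x1: bits 6, 5, 2 and 0 are demanded.
  uint64_t Demanded = 0x65; // 0b01100101
  uint64_t Imm = 0x41;      // 0b01000001, demanded bits only
  uint64_t NewImm = fillNonDemandedBits(Imm, Demanded, 8);
  assert(NewImm == 0xC3 && "expected 0b11000011, as in the comment");
  printf("0x%llx\n", (unsigned long long)NewImm);
  return 0;
}

Unlike the patch, which computes NonDemandedBits over the full 64-bit value and masks only at the end, this sketch masks everything to the element width up front.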
Index: test/CodeGen/AArch64/optimize-imm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/optimize-imm.ll
@@ -0,0 +1,64 @@
+; RUN: llc -o - %s -march=aarch64 | FileCheck %s
+
+; CHECK-LABEL: _and1:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffffffd
+
+define void @and1(i32 %a, i8* nocapture %p) {
+entry:
+  %and = and i32 %a, 253
+  %conv = trunc i32 %and to i8
+  store i8 %conv, i8* %p, align 1
+  ret void
+}
+
+; (a & 0x3dfd) | 0xffffc000
+;
+; CHECK-LABEL: _and2:
+; CHECK: and {{w[0-9]+}}, w0, #0xfdfdfdfd
+
+define i32 @and2(i32 %a) {
+entry:
+  %and = and i32 %a, 15869
+  %or = or i32 %and, -16384
+  ret i32 %or
+}
+
+; (a & 0x19) | 0xffffffc0
+;
+; CHECK-LABEL: _and3:
+; CHECK: and {{w[0-9]+}}, w0, #0x99999999
+
+define i32 @and3(i32 %a) {
+entry:
+  %and = and i32 %a, 25
+  %or = or i32 %and, -64
+  ret i32 %or
+}
+
+; (a & 0xc5600) | 0xfff1f1ff
+;
+; CHECK-LABEL: _and4:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffc07ff
+
+define i32 @and4(i32 %a) {
+entry:
+  %and = and i32 %a, 787968
+  %or = or i32 %and, -921089
+  ret i32 %or
+}
+
+; Make sure we don't shrink or optimize an XOR's immediate operand if the
+; immediate is -1. Instruction selection turns (and ((xor $mask, -1), $v0)) into
+; a BIC.
+
+; CHECK-LABEL: _xor1:
+; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38
+; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3
+
+define i32 @xor1(i32 %a) {
+entry:
+  %shl = shl i32 %a, 3
+  %xor = and i32 %shl, 56
+  %and = xor i32 %xor, 56
+  ret i32 %and
+}
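To make the masks in the test above easier to check by hand, here is an independent, self-contained re-implementation sketch of the bimm32/bimm64 validity rule that AArch64_AM::processLogicalImmediate enforces in the patch: a rotation of a contiguous run of ones, replicated across the register in power-of-two-sized elements, with all-zeros and all-ones excluded. The names isLogicalImm and rotateRight are local to this sketch, not LLVM APIs.

// Standalone sketch of the bimm32/bimm64 rule, for hand-checking the test's
// expected masks; not the LLVM AArch64_AM implementation.
#include <cassert>
#include <cstdint>

static uint64_t rotateRight(uint64_t V, unsigned R, unsigned Size) {
  uint64_t Mask = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
  V &= Mask;
  if (R == 0)
    return V;
  return ((V >> R) | (V << (Size - R))) & Mask;
}

static bool isLogicalImm(uint64_t Imm, unsigned Size) {
  uint64_t Mask = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
  Imm &= Mask;
  if (Imm == 0 || Imm == Mask)
    return false;
  for (unsigned EltSize = 2; EltSize <= Size; EltSize *= 2) {
    uint64_t EltMask = (EltSize == 64) ? ~0ULL : ((1ULL << EltSize) - 1);
    uint64_t Elt = Imm & EltMask;
    // The element must replicate across the whole register.
    bool Replicates = true;
    for (unsigned I = EltSize; I < Size; I += EltSize)
      if (((Imm >> I) & EltMask) != Elt)
        Replicates = false;
    if (!Replicates)
      continue;
    // Some rotation of the element must be a contiguous run of ones anchored
    // at bit 0, i.e. Rot + 1 must be a power of two.
    for (unsigned R = 0; R < EltSize; ++R) {
      uint64_t Rot = rotateRight(Elt, R, EltSize);
      if (Rot != 0 && Rot != EltMask && ((Rot + 1) & Rot) == 0)
        return true;
    }
  }
  return false;
}

int main() {
  // The masks the CHECK lines above expect are all encodable...
  assert(isLogicalImm(0xfffffffdULL, 32));
  assert(isLogicalImm(0xfdfdfdfdULL, 32));
  assert(isLogicalImm(0x99999999ULL, 32));
  assert(isLogicalImm(0xfffc07ffULL, 32));
  // ...while the original AND constants in and2 and and4 are not.
  assert(!isLogicalImm(15869, 32));  // and2: 0x3dfd
  assert(!isLogicalImm(787968, 32)); // and4: 0xc0600
  return 0;
}

Constants that fail this check typically have to be materialized into a register (for example with MOVZ/MOVK) before the logical operation, which is the extra instruction the patch avoids.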