Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2294,20 +2294,29 @@
       New = N;
       return true;
     }
-
-    /// Check to see if the specified operand of the specified instruction is a
-    /// constant integer. If so, check to see if there are any bits set in the
-    /// constant that are not demanded. If so, shrink the constant and return
-    /// true.
-    bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded);
-
-    /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. This
-    /// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
-    /// generalized for targets with other types of implicit widening casts.
-    bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded,
-                          const SDLoc &dl);
   };
 
+  /// Check to see if the specified operand of the specified instruction is a
+  /// constant integer. If so, check to see if there are any bits set in the
+  /// constant that are not demanded. If so, shrink the constant and return
+  /// true.
+  bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+                              TargetLoweringOpt &TLO) const;
+
+  // Target hook to do target-specific constant optimization, which is called
+  // by ShrinkDemandedConstant. This function should return true if the target
+  // doesn't want ShrinkDemandedConstant to further optimize the constant.
+  virtual bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+                                            TargetLoweringOpt &TLO) const {
+    return false;
+  }
+
+  /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. This
+  /// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
+  /// generalized for targets with other types of implicit widening casts.
+  bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded,
+                        TargetLoweringOpt &TLO) const;
+
   /// Look at Op. At this point, we know that only the DemandedMask bits of the
   /// result of Op are ever used downstream. If we can use this information to
   /// simplify Op, create a new simplified DAG node and return true, returning
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -337,10 +337,26 @@
 /// Check to see if the specified operand of the specified instruction is a
 /// constant integer. If so, check to see if there are any bits set in the
 /// constant that are not demanded. If so, shrink the constant and return true.
-bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
-                                                               const APInt &Demanded) {
+bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+                                            TargetLoweringOpt &TLO) const {
+  SelectionDAG &DAG = TLO.DAG;
   SDLoc dl(Op);
 
+  if (Op.getOpcode() == ISD::XOR) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+    // If an XOR already has all the bits set, there is nothing to change but
+    // don't shrink the constant either.
+    if (!C || (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+      return false;
+  }
+
+  assert(!TLO.Old.getNode() && !TLO.New.getNode());
+
+  // Do target-specific constant optimization.
+  if (targetShrinkDemandedConstant(Op, Demanded, TLO))
+    return TLO.New.getNode();
+
   // FIXME: ISD::SELECT, ISD::SELECT_CC
   switch (Op.getOpcode()) {
   default: break;
@@ -350,10 +366,6 @@
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
     if (!C) return false;
 
-    if (Op.getOpcode() == ISD::XOR &&
-        (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
-      return false;
-
     // if we can expand it to have all bits set, do it
     if (C->getAPIntValue().intersects(~Demanded)) {
       EVT VT = Op.getValueType();
@@ -361,7 +373,7 @@
                                 DAG.getConstant(Demanded & C->getAPIntValue(),
                                                 dl, VT));
-      return CombineTo(Op, New);
+      return TLO.CombineTo(Op, New);
     }
 
     break;
@@ -374,15 +386,17 @@
 /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free.
 /// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
 /// generalized for targets with other types of implicit widening casts.
-bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
-                                                         unsigned BitWidth,
-                                                         const APInt &Demanded,
-                                                         const SDLoc &dl) {
+bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
+                                      const APInt &Demanded,
+                                      TargetLoweringOpt &TLO) const {
   assert(Op.getNumOperands() == 2 &&
          "ShrinkDemandedOp only supports binary operators!");
   assert(Op.getNode()->getNumValues() == 1 &&
          "ShrinkDemandedOp only supports nodes with one result!");
 
+  SelectionDAG &DAG = TLO.DAG;
+  SDLoc dl(Op);
+
   // Early return, as this function cannot handle vector types.
   if (Op.getValueType().isVector())
     return false;
@@ -412,7 +426,7 @@
       bool NeedZext = DemandedSize > SmallVTBits;
       SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND,
                               dl, Op.getValueType(), X);
-      return CombineTo(Op, Z);
+      return TLO.CombineTo(Op, Z);
     }
   }
   return false;
@@ -509,7 +523,7 @@
         return TLO.CombineTo(Op, Op.getOperand(0));
       // If any of the set bits in the RHS are known zero on the LHS, shrink
       // the constant.
-      if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask))
+      if (ShrinkDemandedConstant(Op, ~LHSZero & NewMask, TLO))
        return true;
     }
 
@@ -532,10 +546,10 @@
     if ((NewMask & (KnownZero|KnownZero2)) == NewMask)
       return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, Op.getValueType()));
     // If the RHS is a constant, see if we can simplify it.
-    if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask))
+    if (ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask, TLO))
       return true;
     // If the operation can be done in a smaller type, do so.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO))
       return true;
 
     // Output known-1 bits are only known if set in both the LHS & RHS.
@@ -566,10 +580,10 @@
     if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
       return TLO.CombineTo(Op, Op.getOperand(1));
     // If the RHS is a constant, see if we can simplify it.
-    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+    if (ShrinkDemandedConstant(Op, NewMask, TLO))
      return true;
     // If the operation can be done in a smaller type, do so.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO))
      return true;
 
     // Output known-0 bits are only known if clear in both the LHS & RHS.
@@ -594,7 +608,7 @@
     if ((KnownZero2 & NewMask) == NewMask)
       return TLO.CombineTo(Op, Op.getOperand(1));
     // If the operation can be done in a smaller type, do so.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO))
      return true;
 
     // If all of the unknown bits are known to be zero on one side or the other
@@ -639,7 +653,7 @@
           }
           // if it already has all the bits set, nothing to change
           // but don't shrink either!
-        } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) {
+        } else if (ShrinkDemandedConstant(Op, NewMask, TLO)) {
           return true;
         }
       }
@@ -658,7 +672,7 @@
     assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
 
     // If the operands are constants, see if we can simplify them.
-    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+    if (ShrinkDemandedConstant(Op, NewMask, TLO))
       return true;
 
     // Only known if known in both the LHS and RHS.
@@ -676,7 +690,7 @@
     assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
 
     // If the operands are constants, see if we can simplify them.
-    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+    if (ShrinkDemandedConstant(Op, NewMask, TLO))
       return true;
 
     // Only known if known in both the LHS and RHS.
@@ -1169,7 +1183,7 @@
                              KnownOne2, TLO, Depth+1))
       return true;
     // See if the operation should be performed at a smaller bit width.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO))
       return true;
     LLVM_FALLTHROUGH;
   }
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -250,6 +250,9 @@
                                      APInt &KnownOne, const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;
 
+  bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+                                    TargetLoweringOpt &TLO) const override;
+
   MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
 
   /// Returns true if the target allows unaligned memory accesses of the
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -39,6 +39,7 @@
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
 
 static cl::opt<bool>
 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
@@ -53,6 +54,12 @@
                  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
                  cl::init(false));
 
+static cl::opt<bool>
+EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
+                         cl::desc("Enable AArch64 logical imm instruction "
+                                  "optimization"),
+                         cl::init(true));
+
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -737,6 +744,137 @@
   return VT.changeVectorElementTypeToInteger();
 }
 
+static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
+                               const APInt &Demanded,
+                               TargetLowering::TargetLoweringOpt &TLO,
+                               unsigned NewOpc) {
+  uint64_t OldImm = Imm, NewImm, Enc;
+  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size));
+
+  // Return if the immediate is already a bimm32 or bimm64.
+  if (AArch64_AM::processLogicalImmediate(Imm & Mask, Size, Enc))
+    return false;
+
+  unsigned EltSize = Size;
+  uint64_t DemandedBits = Demanded.getZExtValue();
+
+  // Clear bits that are not demanded.
+  Imm &= DemandedBits;
+
+  while (true) {
+    // The goal here is to set the non-demanded bits in a way that minimizes
+    // the number of transitions between 0 and 1. In order to achieve this
+    // goal, we set the non-demanded bits to the value of the preceding
+    // demanded bits. For example, if we have an immediate 0bx10xx0x1 ('x'
+    // indicates a non-demanded bit), we copy bit0 (1) to the least
+    // significant 'x', bit2 (0) to 'xx', and bit6 (1) to the most
+    // significant 'x'. The final result is 0b11000011.
+    uint64_t NonDemandedBits = ~DemandedBits;
+    uint64_t InvertedImm = ~Imm & DemandedBits;
+    uint64_t RotatedImm =
+        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
+        NonDemandedBits;
+    uint64_t Sum = RotatedImm + NonDemandedBits;
+    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
+    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
+    NewImm = (Imm | Ones) & Mask;
+
+    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
+    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
+    // we halve the element size and continue the search.
+    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
+      break;
+
+    // We cannot shrink the element size any further if it is 2 bits.
+    if (EltSize == 2)
+      return false;
+
+    EltSize /= 2;
+    Mask >>= EltSize;
+    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
+
+    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
+    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
+      return false;
+
+    // Merge the upper and lower halves of Imm and DemandedBits.
+    Imm |= Hi;
+    DemandedBits |= DemandedBitsHi;
+  }
+
+  ++NumOptimizedImms;
+
+  // Replicate the element across the register width.
+  while (EltSize < Size) {
+    NewImm |= NewImm << EltSize;
+    EltSize *= 2;
+  }
+
+  (void)OldImm;
+  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
+         "demanded bits should never be altered");
+
+  // Create the new constant immediate node.
+  EVT VT = Op.getValueType();
+  unsigned Population = countPopulation(NewImm);
+  SDLoc DL(Op);
+
+  // If the new constant immediate is all-zeros or all-ones, let the target
+  // independent DAG combine optimize this node.
+  if (Population == 0 || Population == Size)
+    return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT));
+
+  // Otherwise, create a machine node so that target independent DAG combine
+  // doesn't undo this optimization.
+  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
+  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
+  SDValue New(
+      TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
+
+  return TLO.CombineTo(Op, New);
+}
+
+bool AArch64TargetLowering::targetShrinkDemandedConstant(
+    SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
+  // Delay this optimization until as late as possible.
+  if (!TLO.LegalOps)
+    return false;
+
+  if (!EnableOptimizeLogicalImm)
+    return false;
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return false;
+
+  // FIXME: Check if Op has operand 1?
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!C)
+    return false;
+
+  unsigned Size = VT.getSizeInBits();
+  assert((Size == 32 || Size == 64) &&
+         "i32 or i64 is expected after legalization.");
+  uint64_t Imm = C->getZExtValue();
+  unsigned NewOpc;
+
+  switch (Op.getOpcode()) {
+  case ISD::AND:
+    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
+    break;
+  case ISD::OR:
+    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
+    break;
+  case ISD::XOR:
+    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
+    break;
+  default:
+    return false;
+  }
+
+  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
+}
+
 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 /// Mask are known to be either zero or one and return them in the
 /// KnownZero/KnownOne bitsets.
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2719,7 +2719,7 @@
     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                           !DCI.isBeforeLegalizeOps());
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
+    if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
        TLI.SimplifyDemandedBits(BitsFrom, Demanded,
                                 KnownZero, KnownOne, TLO)) {
       DCI.CommitTargetLoweringOpt(TLO);
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3365,7 +3365,7 @@
     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                           !DCI.isBeforeLegalizeOps());
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+    if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);
    }
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -27534,7 +27534,7 @@
     APInt KnownZero, KnownOne;
     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                           DCI.isBeforeLegalizeOps());
-    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+    if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
                                 TLO)) {
       // If we changed the computation somewhere in the DAG, this change
@@ -30720,7 +30720,7 @@
     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                           !DCI.isBeforeLegalizeOps());
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+    if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
   }
Index: lib/Target/XCore/XCoreISelLowering.cpp
===================================================================
--- lib/Target/XCore/XCoreISelLowering.cpp
+++ lib/Target/XCore/XCoreISelLowering.cpp
@@ -1605,7 +1605,7 @@
       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                             !DCI.isBeforeLegalizeOps());
       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-      if (TLO.ShrinkDemandedConstant(OutVal, DemandedMask) ||
+      if (TLI.ShrinkDemandedConstant(OutVal, DemandedMask, TLO) ||
          TLI.SimplifyDemandedBits(OutVal, DemandedMask, KnownZero, KnownOne,
                                   TLO))
        DCI.CommitTargetLoweringOpt(TLO);
@@ -1622,7 +1622,7 @@
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-     if (TLO.ShrinkDemandedConstant(Time, DemandedMask) ||
+     if (TLI.ShrinkDemandedConstant(Time, DemandedMask, TLO) ||
         TLI.SimplifyDemandedBits(Time, DemandedMask, KnownZero, KnownOne,
                                  TLO))
       DCI.CommitTargetLoweringOpt(TLO);
Index: test/CodeGen/AArch64/bitreverse.ll
===================================================================
--- test/CodeGen/AArch64/bitreverse.ll
+++ test/CodeGen/AArch64/bitreverse.ll
@@ -28,10 +28,8 @@
 ; CHECK-DAG: lsr [[S2:w.*]], [[H2]], #2
 ; CHECK-DAG: orr [[R2:w.*]], [[S2]], [[L2]], lsl #2
 
-; CHECK-DAG: mov [[P1:w.*]], #1426063360
-; CHECK-DAG: mov [[N1:w.*]], #-1442840576
-; CHECK-DAG: and [[L1:w.*]], [[R2]], [[P1]]
-; CHECK-DAG: and [[H1:w.*]], [[R2]], [[N1]]
+; CHECK-DAG: and [[L1:w.*]], [[R2]], #0x55555555
+; CHECK-DAG: and [[H1:w.*]], [[R2]], #0xaaaaaaaa
 
 ; CHECK-DAG: lsr [[S1:w.*]], [[H1]], #1
 ; CHECK-DAG: orr [[R1:w.*]], [[S1]], [[L1]], lsl #1
Index: test/CodeGen/AArch64/optimize-imm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/optimize-imm.ll
@@ -0,0 +1,64 @@
+; RUN: llc -o - %s -mtriple=aarch64-- | FileCheck %s
+
+; CHECK-LABEL: and1:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffffffd
+
+define void @and1(i32 %a, i8* nocapture %p) {
+entry:
+  %and = and i32 %a, 253
+  %conv = trunc i32 %and to i8
+  store i8 %conv, i8* %p, align 1
+  ret void
+}
+
+; (a & 0x3dfd) | 0xffffc000
+;
+; CHECK-LABEL: and2:
+; CHECK: and {{w[0-9]+}}, w0, #0xfdfdfdfd
+
+define i32 @and2(i32 %a) {
+entry:
+  %and = and i32 %a, 15869
+  %or = or i32 %and, -16384
+  ret i32 %or
+}
+
+; (a & 0x19) | 0xffffffc0
+;
+; CHECK-LABEL: and3:
+; CHECK: and {{w[0-9]+}}, w0, #0x99999999
+
+define i32 @and3(i32 %a) {
+entry:
+  %and = and i32 %a, 25
+  %or = or i32 %and, -64
+  ret i32 %or
+}
+
+; (a & 0xc5600) | 0xfff1f1ff
+;
+; CHECK-LABEL: and4:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffc07ff
+
+define i32 @and4(i32 %a) {
+entry:
+  %and = and i32 %a, 787968
+  %or = or i32 %and, -921089
+  ret i32 %or
+}
+
+; Make sure we don't shrink or optimize an XOR's immediate operand if the
+; immediate is -1. Instruction selection turns (and ((xor $mask, -1), $v0)) into
+; a BIC.
+
+; CHECK-LABEL: xor1:
+; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38
+; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3
+
+define i32 @xor1(i32 %a) {
+entry:
+  %shl = shl i32 %a, 3
+  %xor = and i32 %shl, 56
+  %and = xor i32 %xor, 56
+  ret i32 %and
+}
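
Reviewer note, not part of the patch: the comment inside optimizeLogicalImm describes filling every non-demanded bit with the value of the preceding demanded bit (wrapping around from the top), which collapses the immediate into long runs of identical bits and makes it far more likely to encode as an AArch64 logical immediate. The standalone C++ sketch below implements that propagation in the most direct, per-bit way on the comment's 0bx10xx0x1 example; the helper name fillNonDemandedBits is made up for illustration, and the patch itself arrives at the same filling branch-free via the rotate/add sequence in the hunk above.

#include <cstdint>

// Fill every non-demanded bit of Imm with the value of the nearest demanded
// bit below it, wrapping around from the top bit, so that runs of identical
// bits become as long as possible.
static uint64_t fillNonDemandedBits(uint64_t Imm, uint64_t DemandedBits,
                                    unsigned EltSize) {
  // Seed with the highest demanded bit so the non-demanded bits below the
  // lowest demanded bit (the wrap-around case) get a defined value.
  bool Last = false;
  for (int I = EltSize - 1; I >= 0; --I) {
    if (DemandedBits & (1ULL << I)) {
      Last = ((Imm >> I) & 1) != 0;
      break;
    }
  }

  uint64_t Result = 0;
  for (unsigned I = 0; I != EltSize; ++I) {
    if (DemandedBits & (1ULL << I))
      Last = ((Imm >> I) & 1) != 0; // Demanded bit: keep its value.
    if (Last)
      Result |= 1ULL << I;          // Copy the last demanded value into this bit.
  }
  return Result;
}

int main() {
  // Example from the patch comment: 0bx10xx0x1 becomes 0b11000011.
  uint64_t Imm = 0x41;      // Demanded bits 0, 2, 5 and 6 hold 1, 0, 0, 1.
  uint64_t Demanded = 0x65; // 0b01100101.
  return fillNonDemandedBits(Imm, Demanded, 8) == 0xc3 ? 0 : 1; // 0b11000011
}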