Index: llvm/lib/Target/AArch64/AArch64.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64.h
+++ llvm/lib/Target/AArch64/AArch64.h
@@ -51,6 +51,7 @@
 FunctionPass *createFalkorHWPFFixPass();
 FunctionPass *createFalkorMarkStridedAccessesPass();
 FunctionPass *createAArch64BranchTargetsPass();
+FunctionPass *createAArch64MIPeepholeOptPass();
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
@@ -82,6 +83,7 @@
 void initializeAArch64SpeculationHardeningPass(PassRegistry&);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
 void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
+void initializeAArch64MIPeepholeOptPass(PassRegistry &);
 void initializeAArch64SIMDInstrOptPass(PassRegistry&);
 void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
 void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
Index: llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -0,0 +1,177 @@
+//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs the following peephole optimizations at the MIR level.
+//
+// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
+//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+//
+// The mov pseudo instruction may be expanded into multiple mov instructions
+// later. In that case, we can try to split the constant operand of the mov
+// into two bitmask immediates, which produces two AND instructions instead of
+// multiple `mov` + `and` instructions.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ExpandImm.h"
+#include "AArch64InstrInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-mi-peephole-opt"
+
+namespace {
+
+struct AArch64MIPeepholeOpt : public MachineFunctionPass {
+  static char ID;
+
+  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
+    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  const AArch64InstrInfo *TII;
+  MachineLoopInfo *MLI;
+  MachineRegisterInfo *MRI;
+
+  template <typename T>
+  bool visitAND(MachineInstr &MI,
+                SmallVectorImpl<MachineInstr *> &ToBeRemoved);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AArch64 MI Peephole Optimization pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+char AArch64MIPeepholeOpt::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
+                "AArch64 MI Peephole Optimization", false, false)
+
+template <typename T>
+bool AArch64MIPeepholeOpt::visitAND(
+    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &ToBeRemoved) {
+  // Try the below transformation.
+  //
+  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
+  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+  //
+  // The mov pseudo instruction may be expanded into multiple mov instructions
+  // later. Let's try to split the constant operand of the mov into two bitmask
+  // immediates, so that only two AND instructions are emitted instead of
+  // multiple mov + and instructions.
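+  //
+  // As a concrete illustration (the values are taken from the tests added in
+  // this patch), the constant 2098176 (0x00200400) is not a valid logical
+  // immediate, so without this transformation it is materialized with a
+  // mov/movk pair feeding a register-register AND:
+  //
+  //   mov  w8, #1024
+  //   movk w8, #32, lsl #16
+  //   and  w8, w0, w8
+  //
+  // Splitting it into the bitmask immediates 0x003ffc00 and 0xffe007ff needs
+  // only two instructions:
+  //
+  //   and  w8, w0, #0x3ffc00
+  //   and  w8, w8, #0xffe007ff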
+
+  // Check whether the AND's basic block is inside a loop and, if so, whether
+  // the AND is loop invariant.
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineLoop *L = MLI->getLoopFor(MBB);
+  if (L && !L->isLoopInvariant(MI))
+    return false;
+
+  // Check whether the AND's operand is a MOV with an immediate.
+  MachineInstr *DefMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
+  MachineInstr *SubregToRegMI = nullptr;
+  // If it is a SUBREG_TO_REG, look through it to its source operand.
+  if (DefMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
+    SubregToRegMI = DefMI;
+    DefMI = MRI->getUniqueVRegDef(DefMI->getOperand(2).getReg());
+  }
+
+  if (DefMI->getOpcode() != AArch64::MOVi32imm &&
+      DefMI->getOpcode() != AArch64::MOVi64imm)
+    return false;
+
+  // If the current immediate is already a valid bitmask immediate, there is
+  // no need to split it.
+  unsigned RegSize = sizeof(T) * 8;
+  uint64_t Imm = DefMI->getOperand(1).getImm();
+  if (AArch64_AM::isLogicalImmediate(Imm, RegSize))
+    return false;
+
+  // Check whether the bitmask immediate can be split.
+  T UImm = static_cast<T>(Imm);
+  if (!AArch64_AM::isValidAndSplitBitmaskImm(UImm, RegSize))
+    return false;
+
+  // Split the bitmask immediate into two.
+  T Imm1Enc = AArch64_AM::splitAndBitmaskImm(UImm, RegSize, true);
+  T Imm2Enc = AArch64_AM::splitAndBitmaskImm(UImm, RegSize, false);
+
+  // Create the new AND MIs.
+  DebugLoc DL = MI.getDebugLoc();
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register NewTmpReg = MRI->createVirtualRegister(MRI->getRegClass(DstReg));
+  unsigned Opcode = (RegSize == 32) ? AArch64::ANDWri : AArch64::ANDXri;
+
+  BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+      .addReg(SrcReg)
+      .addImm(Imm1Enc);
+
+  BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
+      .addReg(NewTmpReg)
+      .addImm(Imm2Enc);
+
+  ToBeRemoved.push_back(&MI);
+
+  // If the mov instruction has only one use, delete it.
+  if (MRI->hasOneUse(DefMI->getOperand(0).getReg()))
+    ToBeRemoved.push_back(DefMI);
+
+  // If the SUBREG_TO_REG instruction has only one use, delete it as well.
+  if (SubregToRegMI && MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
+    ToBeRemoved.push_back(SubregToRegMI);
+
+  return true;
+}
+
+bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  MLI = &getAnalysis<MachineLoopInfo>();
+  MRI = &MF.getRegInfo();
+
+  if (!MRI->isSSA())
+    return false;
+
+  bool Changed = false;
+  SmallVector<MachineInstr *, 8> ToBeRemoved;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      switch (MI.getOpcode()) {
+      default:
+        break;
+      case AArch64::ANDWrr:
+        Changed |= visitAND<uint32_t>(MI, ToBeRemoved);
+        break;
+      case AArch64::ANDXrr:
+        Changed |= visitAND<uint64_t>(MI, ToBeRemoved);
+        break;
+      }
+    }
+  }
+
+  for (MachineInstr *MI : ToBeRemoved)
+    MI->eraseFromParent();
+
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
+  return new AArch64MIPeepholeOpt();
+}
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -195,6 +195,7 @@
   initializeAArch64DeadRegisterDefinitionsPass(*PR);
   initializeAArch64ExpandPseudoPass(*PR);
   initializeAArch64LoadStoreOptPass(*PR);
+  initializeAArch64MIPeepholeOptPass(*PR);
   initializeAArch64SIMDInstrOptPass(*PR);
   initializeAArch64O0PreLegalizerCombinerPass(*PR);
   initializeAArch64PreLegalizerCombinerPass(*PR);
@@ -479,6 +480,7 @@
   bool addRegBankSelect() override;
   void addPreGlobalInstructionSelect() override;
   bool addGlobalInstructionSelect() override;
+  void addMachineSSAOptimization() override;
   bool addILPOpts() override;
   void addPreRegAlloc() override;
   void addPostRegAlloc() override;
@@ -649,6 +651,14 @@
   return false;
 }
+
+void AArch64PassConfig::addMachineSSAOptimization() {
+  // Run the default MachineSSAOptimization pipeline first.
+  TargetPassConfig::addMachineSSAOptimization();
+
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createAArch64MIPeepholeOptPass());
+}
+
 bool AArch64PassConfig::addILPOpts() {
   if (EnableCondOpt)
     addPass(createAArch64ConditionOptimizerPass());
Index: llvm/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AArch64/CMakeLists.txt
+++ llvm/lib/Target/AArch64/CMakeLists.txt
@@ -66,6 +66,7 @@
   AArch64LowerHomogeneousPrologEpilog.cpp
   AArch64MachineFunctionInfo.cpp
   AArch64MacroFusion.cpp
+  AArch64MIPeepholeOpt.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
   AArch64PBQPRegAlloc.cpp
Index: llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
===================================================================
--- llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
 #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+#include "AArch64ExpandImm.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/bit.h"
@@ -337,6 +338,66 @@
   return true;
 }
+
+template <typename T>
+static inline bool isValidAndSplitBitmaskImm(T Imm, unsigned regSize) {
+  assert((regSize == 32 || regSize == 64) &&
+         "Invalid regSize for AndsplitBitmaskImm");
+  T UImm = static_cast<T>(Imm);
+  if (isLogicalImmediate(UImm, regSize))
+    return false;
+
+  // If this immediate can be handled by one instruction, do not split it.
+  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+  AArch64_IMM::expandMOVImm(UImm, regSize, Insn);
+  if (Insn.size() == 1)
+    return false;
+
+  // A bitmask immediate consists of consecutive ones. Take, for example, the
+  // constant 0b00000000001000000000010000000000, which does not consist of
+  // consecutive ones. It can be split into the two bitmask immediates
+  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111,
+  // and ANDing with both of them reproduces the original constant.
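+  //
+  // Worked through in hex, the example constant above is 0x00200400, so
+  // LowestBitSet = 10 and HighestBitSet = 21, which gives
+  //   NewImm1 = (2 << 21) - (1 << 10)    = 0x003ffc00
+  //   NewImm2 = 0x00200400 | ~0x003ffc00 = 0xffe007ff
+  // and indeed (0x003ffc00 & 0xffe007ff) == 0x00200400.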
+  unsigned LowestBitSet = countTrailingZeros(UImm);
+  unsigned HighestBitSet = Log2_64(UImm);
+
+  // Create a mask which is filled with ones from the position of the lowest
+  // set bit to the position of the highest set bit.
+  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
+              (static_cast<T>(1) << LowestBitSet);
+  // Create a mask which is filled with ones outside the range between the
+  // lowest and highest set bits.
+  T NewImm2 = UImm | ~NewImm1;
+
+  // If the split value is not a valid bitmask immediate, do not split this
+  // constant.
+  if (!isLogicalImmediate(NewImm2, regSize))
+    return false;
+  return true;
+}
+
+template <typename T>
+static inline T splitAndBitmaskImm(T Imm, unsigned regSize, bool FirstImm) {
+  assert((regSize == 32 || regSize == 64) &&
+         "Invalid regSize for AndsplitBitmaskImm");
+
+  unsigned LowestBitSet = countTrailingZeros(Imm);
+  unsigned HighestBitSet = Log2_64(Imm);
+
+  // Create a mask which is filled with ones from the position of the lowest
+  // set bit to the position of the highest set bit.
+  T Imm1 = (static_cast<T>(2) << HighestBitSet) -
+           (static_cast<T>(1) << LowestBitSet);
+  T Imm1Enc = encodeLogicalImmediate(Imm1, regSize);
+  if (FirstImm)
+    return Imm1Enc;
+
+  // Create a mask which is filled with ones outside the range between the
+  // lowest and highest set bits.
+  T Imm2 = Imm | ~Imm1;
+  T Imm2Enc = AArch64_AM::encodeLogicalImmediate(Imm2, regSize);
+  return Imm2Enc;
+}
+
 //===----------------------------------------------------------------------===//
 // Floating-point Immediates
 //
Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -40,7 +40,7 @@
 ; CHECK-NEXT:       Induction Variable Users
 ; CHECK-NEXT:       Loop Strength Reduction
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results 
+; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Merge contiguous icmps into a memcmp
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
@@ -131,6 +131,7 @@
 ; CHECK-NEXT:       Machine code sinking
 ; CHECK-NEXT:       Peephole Optimizations
 ; CHECK-NEXT:       Remove dead machine instructions
+; CHECK-NEXT:       AArch64 MI Peephole Optimization pass
 ; CHECK-NEXT:       AArch64 Dead register definitions
 ; CHECK-NEXT:       Detect Dead Lanes
 ; CHECK-NEXT:       Process Implicit Definitions
Index: llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+define i8 @test1(i32 %a) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0x3ffc00
+; CHECK-NEXT:    and w8, w8, #0xffe007ff
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %a, 2098176
+  %cmp = icmp eq i32 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i8 @test2(i32 %a) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #135
+; CHECK-NEXT:    and w8, w0, w8
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %a, 135
+  %cmp = icmp eq i32 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+; This constant should not be split because the split immediate is not a valid
+; bitmask immediate.
+define i8 @test3(i32 %a) {
+; CHECK-LABEL: test3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024
+; CHECK-NEXT:    movk w8, #33, lsl #16
+; CHECK-NEXT:    and w8, w0, w8
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %a, 2163712
+  %cmp = icmp eq i32 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+define i8 @test4(i64 %a) {
+; CHECK-LABEL: test4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and x8, x0, #0x3ffc00
+; CHECK-NEXT:    and x8, x8, #0xffffffffffe007ff
+; CHECK-NEXT:    cmp x8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i64 %a, 2098176
+  %cmp = icmp eq i64 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+define i8 @test5(i64 %a) {
+; CHECK-LABEL: test5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and x8, x0, #0x3ffffc000
+; CHECK-NEXT:    and x8, x8, #0xfffffffe00007fff
+; CHECK-NEXT:    cmp x8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i64 %a, 8589950976
+  %cmp = icmp eq i64 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i8 @test6(i64 %a) {
+; CHECK-LABEL: test6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #135
+; CHECK-NEXT:    and x8, x0, x8
+; CHECK-NEXT:    cmp x8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i64 %a, 135
+  %cmp = icmp eq i64 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+; This constant should not be split because the split immediate is not a valid
+; bitmask immediate.
+define i8 @test7(i64 %a) {
+; CHECK-LABEL: test7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024
+; CHECK-NEXT:    movk w8, #33, lsl #16
+; CHECK-NEXT:    and x8, x0, x8
+; CHECK-NEXT:    cmp x8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i64 %a, 2163712
+  %cmp = icmp eq i64 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+; The split bitmask immediates should be hoisted outside the loop because they
+; are loop invariant.
+define void @test8(i64 %a, i64* noalias %src, i64* noalias %dst, i64 %n) {
+; CHECK-LABEL: test8:
+; CHECK:       // %bb.0: // %loop.ph
+; CHECK-NEXT:    and x9, x0, #0x3ffc00
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    and x9, x9, #0xffffffffffe007ff
+; CHECK-NEXT:    b .LBB7_2
+; CHECK-NEXT:  .LBB7_1: // %for.inc
+; CHECK-NEXT:    // in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, x3
+; CHECK-NEXT:    b.gt .LBB7_4
+; CHECK-NEXT:  .LBB7_2: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    b.hs .LBB7_1
+; CHECK-NEXT:  // %bb.3: // %if.then
+; CHECK-NEXT:    // in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    lsl x10, x8, #3
+; CHECK-NEXT:    ldr x11, [x1, x10]
+; CHECK-NEXT:    str x11, [x2, x10]
+; CHECK-NEXT:    b .LBB7_1
+; CHECK-NEXT:  .LBB7_4: // %exit
+; CHECK-NEXT:    ret
+loop.ph:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %inc, %for.inc ], [ 0, %loop.ph ]
+  %and = and i64 %a, 2098176
+  %cmp = icmp ult i64 %iv, %and
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %src.arrayidx = getelementptr inbounds i64, i64* %src, i64 %iv
+  %val = load i64, i64* %src.arrayidx
+  %dst.arrayidx = getelementptr inbounds i64, i64* %dst, i64 %iv
+  store i64 %val, i64* %dst.arrayidx
+  br label %for.inc
+
+if.else:
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i64 %iv, 1
+  %cond = icmp sgt i64 %inc, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; This constant should not be split because the `and` is not loop invariant.
+define i32 @test9(i32* nocapture %x, i32* nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: test9:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB8_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, #1024
+; CHECK-NEXT:    mov w8, w2
+; CHECK-NEXT:    movk w9, #32, lsl #16
+; CHECK-NEXT:  .LBB8_2: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr w10, [x1], #4
+; CHECK-NEXT:    subs x8, x8, #1
+; CHECK-NEXT:    and w10, w10, w9
+; CHECK-NEXT:    str w10, [x0], #4
+; CHECK-NEXT:    b.ne .LBB8_2
+; CHECK-NEXT:  .LBB8_3: // %for.cond.cleanup
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+entry:
+  %cmp8 = icmp sgt i32 %n, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret i32 0
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %and = and i32 %0, 2098176
+  %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  store i32 %and, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
Index: llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
===================================================================
--- llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -245,10 +245,9 @@
 define i32 @n0_badconstmask(i32 %x, i32 %y) {
 ; CHECK-LABEL: n0_badconstmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #256
-; CHECK-NEXT:    movk w9, #65280, lsl #16
+; CHECK-NEXT:    and w9, w1, #0xffffff00
 ; CHECK-NEXT:    and w8, w0, #0xffff00
-; CHECK-NEXT:    and w9, w1, w9
+; CHECK-NEXT:    and w9, w9, #0xff0001ff
 ; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %mx = and i32 %x, 16776960