Index: llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -187,7 +187,15 @@
   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
     return selectAddrModeIndexed(Root, Width / 8);
   }
+
+  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
+                                     const MachineRegisterInfo &MRI) const;
+  ComplexRendererFns
+  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
+                                  unsigned SizeInBytes) const;
   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
+  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
+                                       unsigned SizeInBytes) const;
 
   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
 
@@ -1238,8 +1246,8 @@
   if (DstSize != 64)
     return false;
 
-  // Check if we can do any folding from GEPs etc. into the load.
-  auto ImmFn = selectAddrModeRegisterOffset(I.getOperand(1));
+  // Check if we can do any folding from GEPs/shifts etc. into the load.
+  auto ImmFn = selectAddrModeXRO(I.getOperand(1), MemBytes);
   if (!ImmFn)
     return false;
 
@@ -3995,6 +4003,98 @@
   }};
 }
 
+/// Return true if it is worth folding MI into an extended register. That is,
+/// if it's safe to pull it into the addressing mode of a load or store as a
+/// shift.
+bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
+    MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+  // Always fold if there is one use, or if we're optimizing for size.
+  Register DefReg = MI.getOperand(0).getReg();
+  if (MRI.hasOneUse(DefReg) ||
+      MI.getParent()->getParent()->getFunction().hasMinSize())
+    return true;
+
+  // It's better to avoid folding and recomputing shifts when we don't have a
+  // fastpath.
+  if (!STI.hasLSLFast())
+    return false;
+
+  // We have a fastpath, so folding a shift in and potentially computing it
+  // many times may be beneficial. Check if this is only used in memory ops.
+  // If it is, then we should fold.
+  return all_of(MRI.use_instructions(DefReg),
+                [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
+}
+
+/// This is used for computing addresses like this:
+///
+/// ldr x1, [x2, x3, lsl #3]
+///
+/// Where x2 is the base register, and x3 is an offset register. The shift-left
+/// is a constant value specific to this load instruction. That is, we'll never
+/// see anything other than a 3 here (which corresponds to the size of the
+/// element being loaded.)
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
+    MachineOperand &Root, unsigned SizeInBytes) const {
+  if (!Root.isReg())
+    return None;
+  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+  // Make sure that the memory op is a valid size.
+  int64_t LegalShiftVal = Log2_32(SizeInBytes);
+  if (LegalShiftVal == 0)
+    return None;
+
+  // We want to find something like this:
+  //
+  // val = G_CONSTANT LegalShiftVal
+  // shift = G_SHL off_reg val
+  // ptr = G_GEP base_reg shift
+  // x = G_LOAD ptr
+  //
+  // And fold it into this addressing mode:
+  //
+  // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
+
+  // Check if we can find the G_GEP.
+  MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI);
+  if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI))
+    return None;
+
+  // Now try to match the G_SHL.
+  MachineInstr *Shl =
+      getOpcodeDef(TargetOpcode::G_SHL, Gep->getOperand(2).getReg(), MRI);
+  if (!Shl || !isWorthFoldingIntoExtendedReg(*Shl, MRI))
+    return None;
+
+  // Now, try to find the specific G_CONSTANT.
+  auto ValAndVReg =
+      getConstantVRegValWithLookThrough(Shl->getOperand(2).getReg(), MRI);
+  if (!ValAndVReg)
+    return None;
+
+  // The value must fit into 3 bits, and must be positive. Make sure that is
+  // true.
+  int64_t ImmVal = ValAndVReg->Value;
+  if ((ImmVal & 0x7) != ImmVal)
+    return None;
+
+  // We are only allowed to shift by LegalShiftVal. This shift value is built
+  // into the instruction, so we can't just use whatever we want.
+  if (ImmVal != LegalShiftVal)
+    return None;
+
+  // We can use the LHS of the GEP as the base, and the LHS of the shift as an
+  // offset. Signify that we are shifting by setting the shift flag to 1.
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(1)); },
+      [=](MachineInstrBuilder &MIB) { MIB.add(Shl->getOperand(1)); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(1); },
+  }};
+}
+
 /// This is used for computing addresses like this:
 ///
 /// ldr x1, [x2, x3]
@@ -4008,11 +4108,6 @@
     MachineOperand &Root) const {
   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
 
-  // If we have a constant offset, then we probably don't want to match a
-  // register offset.
-  if (isBaseWithConstantOffset(Root, MRI))
-    return None;
-
   // We need a GEP.
   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
   if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
@@ -4033,6 +4128,28 @@
   }};
 }
 
+/// This is intended to be equivalent to selectAddrModeXRO in
+/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
+                                              unsigned SizeInBytes) const {
+  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+  // If we have a constant offset, then we probably don't want to match a
+  // register offset.
+  if (isBaseWithConstantOffset(Root, MRI))
+    return None;
+
+  // Try to fold shifts into the addressing mode.
+  auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
+  if (AddrModeFns)
+    return AddrModeFns;
+
+  // If that doesn't work, see if it's possible to fold in registers from
+  // a GEP.
+  return selectAddrModeRegisterOffset(Root);
+}
+
 /// Select a "register plus unscaled signed 9-bit immediate" address. This
 /// should only match when there is an offset that is not valid for a scaled
 /// immediate addressing mode. The "Size" argument is the size in bytes of the
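As a rough source-level illustration of what the new selectAddrModeShiftedExtendXReg renderer is after (a sketch only; the function below is hypothetical and not part of either file in this patch): indexing an array of 64-bit elements computes base + (index << 3), which reaches the selector as the G_CONSTANT 3 / G_SHL / G_GEP / G_LOAD chain exercised by the MIR tests below, and with this change the load can be selected with the shift folded into the addressing mode, e.g. ldr x0, [x0, x1, lsl #3], rather than a separate shift feeding the address register.

    // Hypothetical illustration, not part of the patch. Loading base[idx] for
    // 64-bit elements scales the index by 8, i.e. shifts it left by 3, which
    // matches the LegalShiftVal computed for an 8-byte load.
    long long load_elem(const long long *base, long long idx) {
      return base[idx]; // may now select to something like: ldr x0, [x0, x1, lsl #3]
    }

The ldrxrox_shl test below is the MIR-level version of exactly this shape.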
Index: llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
+++ llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
@@ -5,6 +5,15 @@
   define void @ldrxrox_breg_oreg(i64* %addr) { ret void }
   define void @ldrdrox_breg_oreg(i64* %addr) { ret void }
   define void @more_than_one_use(i64* %addr) { ret void }
+  define void @ldrxrox_shl(i64* %addr) { ret void }
+  define void @ldrdrox_shl(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_1(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_2(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_lsl_fast(i64* %addr) #1 { ret void }
+  define void @more_than_one_use_shl_lsl_slow(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_minsize(i64* %addr) #0 { ret void }
+  attributes #0 = { optsize minsize }
+  attributes #1 = { "target-features"="+lsl-fast" }
 
 ...
 ---
@@ -88,3 +97,236 @@
     %6:gpr(s64) = G_ADD %5, %4
     $x0 = COPY %6(s64)
     RET_ReallyLR implicit $x0
+
+...
+---
+name:            ldrxrox_shl
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: ldrxrox_shl
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: $x2 = COPY [[LDRXroX]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    $x2 = COPY %5(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            ldrdrox_shl
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $d2
+    ; CHECK-LABEL: name: ldrdrox_shl
+    ; CHECK: liveins: $x0, $x1, $d2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: $d2 = COPY [[LDRDroX]]
+    ; CHECK: RET_ReallyLR implicit $d2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:fpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    $d2 = COPY %5(s64)
+    RET_ReallyLR implicit $d2
+
+...
+---
+name:            more_than_one_use_shl_1
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that we can still fall back to the register-register addressing
+    ; mode when we fail to pull in the shift.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_1
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[UBFMXri]], 0, 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]]
+    ; CHECK: $x2 = COPY [[ADDXrr]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_ADD %2, %1
+    %7:gpr(s64) = G_ADD %5, %6
+    $x2 = COPY %7(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_2
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that when the GEP is used outside a memory op, we don't do any
+    ; folding at all.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_2
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
+    ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+    ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[ADDXri]]
+    ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]]
+    ; CHECK: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]]
+    ; CHECK: $x2 = COPY [[ADDXrr2]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_ADD %2, %1
+    %7:gpr(s64) = G_ADD %5, %6
+    %8:gpr(s64) = G_PTRTOINT %4
+    %9:gpr(s64) = G_ADD %8, %7
+    $x2 = COPY %9(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_lsl_fast
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that when we have a fastpath for shift-left, we perform the folding
+    ; if it has more than one use.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_lsl_fast
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]]
+    ; CHECK: $x2 = COPY [[ADDXrr]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %7:gpr(s64) = G_ADD %5, %6
+    $x2 = COPY %7(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_lsl_slow
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that we don't fold into multiple memory ops when we don't have a
+    ; fastpath for shift-left.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_lsl_slow
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
+    ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]]
+    ; CHECK: $x2 = COPY [[ADDXrr1]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %7:gpr(s64) = G_ADD %5, %6
+    $x2 = COPY %7(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_minsize
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that when we're optimizing for size, we'll do the folding no matter
+    ; what.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_minsize
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[UBFMXri]]
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+    ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]]
+    ; CHECK: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[ADDXrr1]]
+    ; CHECK: $x2 = COPY [[ADDXrr2]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_ADD %2, %1
+    %7:gpr(s64) = G_ADD %5, %6
+    %8:gpr(s64) = G_PTRTOINT %4
+    %9:gpr(s64) = G_ADD %8, %7
+    $x2 = COPY %9(s64)
+    RET_ReallyLR implicit $x2
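The more_than_one_use_shl_lsl_fast / more_than_one_use_shl_lsl_slow pair above pins down the new heuristic: with the +lsl-fast target feature the shift is folded into both loads (two LDRXroX), while without it the shift is materialized once (UBFMXri) and the scaled address is reused (two LDRXui). A rough source-level analogue, purely illustrative and not part of the patch, assuming both loads reach selection sharing a single shifted offset:

    // Hypothetical sketch: two loads from the same base + (idx << 3) address.
    // With +lsl-fast it can be worth paying for the shift inside both
    // addressing modes; otherwise it is cheaper to compute the shifted address
    // once and reuse it, which is the choice isWorthFoldingIntoExtendedReg makes.
    long long sum_twice(const volatile long long *base, long long idx) {
      return base[idx] + base[idx]; // volatile keeps both loads
    }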