Index: llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -306,7 +306,7 @@
   Value *MulOp0;
   // Matching "(i * 0x01010101...) >> 24".
   if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
-       match(Op1, m_SpecificInt(MaskShift))) {
+      match(Op1, m_SpecificInt(MaskShift))) {
     Value *ShiftOp0;
     // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
     if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
@@ -402,8 +402,8 @@
 /// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
 /// pessimistic codegen that has to account for setting errno and can enable
 /// vectorization.
-static bool
-foldSqrt(Instruction &I, TargetTransformInfo &TTI, TargetLibraryInfo &TLI) {
+static bool foldSqrt(Instruction &I, TargetTransformInfo &TTI,
+                     TargetLibraryInfo &TLI) {
   // Match a call to sqrt mathlib function.
   auto *Call = dyn_cast<CallInst>(&I);
   if (!Call)
@@ -825,6 +825,63 @@
   return true;
 }
 
+// Calculate the GEP stride and the accumulated constant ModOffset.
+static bool getStrideAndModOffsetOfGEP(Value *PtrOp, APInt &ModOffset,
+                                       std::optional<APInt> &Stride,
+                                       const DataLayout &DL) {
+
+  uint64_t BW = ModOffset.getBitWidth();
+  if (isa<GlobalVariable>(PtrOp)) {
+    ModOffset = APInt(BW, 0);
+    Stride = APInt(BW, 1);
+    return true;
+  }
+
+  // If PtrOp is neither a GlobalVariable nor a GEP, we cannot use the GEP
+  // stride, because the pointer might not arrive back at a GlobalVariable.
+  // Even so, we can still check by alignment.
+  auto *GEP = dyn_cast<GEPOperator>(PtrOp);
+  if (!GEP) {
+    Stride = std::nullopt;
+    return true;
+  }
+
+  Value *PtrOpV = GEP;
+  // Return a minimum gep stride, the greatest common divisor of consecutive
+  // gep index type sizes (c.f. Bézout's identity).
+  while ((GEP = dyn_cast<GEPOperator>(PtrOpV))) {
+    MapVector<Value *, APInt> VarOffsets;
+    if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset))
+      return false;
+
+    for (auto [V, IndexTypeSize] : VarOffsets) {
+      // TODO: handle non-inbounds wrapping
+      if (!Stride)
+        Stride = APInt(BW, IndexTypeSize.getZExtValue());
+      else
+        Stride = APIntOps::GreatestCommonDivisor(
+            *Stride, APInt(BW, IndexTypeSize.getZExtValue()));
+    }
+
+    PtrOpV = GEP->getPointerOperand();
+  }
+
+  // Check whether the pointer arrives back at a GlobalVariable.
+  // Even if it does not, we can still check by alignment.
+  if (!isa<GlobalVariable>(PtrOpV)) {
+    Stride = std::nullopt;
+    return true;
+  }
+
+  // In consideration of signed GEP indices, reduce the offset to a
+  // non-negative remainder of division by the minimum GEP stride.
+  ModOffset = ModOffset.srem(*Stride);
+  if (ModOffset.isNegative())
+    ModOffset += *Stride;
+
+  return true;
+}
+
 /// If C is a constant patterned array and all valid loaded results for given
 /// alignment are same to a constant, return that constant.
 static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
@@ -847,16 +904,11 @@
   if (!GVSize || 4096 < GVSize)
     return false;
 
-  // Check whether pointer arrives back at Global Variable.
-  // If PtrOp is neither GlobalVariable nor GEP, it might not arrive back at
-  // GlobalVariable.
-  // TODO: implement GEP handling
   unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
   std::optional<APInt> Stride;
   APInt ConstOffset = APInt(BW, 0);
-  if (isa<GlobalVariable>(PtrOp)) {
-    Stride = APInt(BW, 1);
-  }
+  if (!getStrideAndModOffsetOfGEP(PtrOp, ConstOffset, Stride, DL))
+    return false;
 
   // Any possible offset could be multiple of GEP stride. And any valid
   // offset is multiple of load alignment, so checking only multiples of bigger
@@ -864,6 +916,7 @@
   if (auto LA = LI->getAlign();
       LA.value() <= GV->getAlign().valueOrOne().value() &&
      (!Stride || Stride->getZExtValue() < LA.value())) {
+    ConstOffset = APInt(BW, 0);
     Stride = APInt(BW, LA.value());
   }
 
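The new helper rests on a simple congruence argument: every variable GEP index contributes a multiple of its type-store size to the byte offset, so by Bézout's identity any reachable offset is congruent to the accumulated constant offset modulo the GCD of those sizes. Below is a minimal standalone C++ sketch of that arithmetic (not part of the patch; plain int64_t stands in for APInt and the sample values are only illustrative), using the shape of the @inbounds_gep_i32_load_i32_const_offset test: one variable i32 index (scale 4) on top of a constant byte offset of 2.

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Byte-offset contributions of the GEP chain: one variable i32 index
  // (scale 4) stacked on a constant "gep i16 ..., i64 1" (constant offset 2).
  std::vector<int64_t> Scales = {4};
  int64_t ConstOffset = 2;

  // Any reachable offset is ConstOffset + sum(k_i * Scale_i), which by
  // Bezout's identity is congruent to ConstOffset modulo gcd of the scales.
  int64_t Stride = 0; // 0 plays the role of the "not yet set" optional
  for (int64_t S : Scales)
    Stride = std::gcd(Stride, S);

  // Reduce to a non-negative remainder, as the helper does with srem plus a
  // conditional add for signed (possibly negative) constant offsets.
  int64_t ModOffset = ((ConstOffset % Stride) + Stride) % Stride;

  std::printf("Stride = %lld, ModOffset = %lld\n", (long long)Stride,
              (long long)ModOffset); // Stride = 4, ModOffset = 2
  return 0;
}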
Index: llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
===================================================================
--- llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
+++ llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
@@ -50,7 +50,10 @@
 ; can't be folded because ptrmask can change ptr, while preserving provenance
 define i8 @inbounds_gep_load_i8_align2_ptrmasked(i64 %idx, i64 %mask){
 ; CHECK-LABEL: @inbounds_gep_load_i8_align2_ptrmasked(
-; CHECK-NEXT:    ret i8 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 2
+; CHECK-NEXT:    ret i8 [[TMP3]]
 ;
   %1 = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 %mask)
   %2 = getelementptr inbounds i8, ptr %1, i64 %idx
@@ -58,37 +61,36 @@
   ret i8 %3
 }
 
-; TODO: this will be ret i32 65537(LE), 16777472(BE)
 define i32 @inbounds_gep_i16_load_i32_align1(i64 %idx){
-; CHECK-LABEL: @inbounds_gep_i16_load_i32_align1(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr @constarray1, i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; LE-LABEL: @inbounds_gep_i16_load_i32_align1(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @inbounds_gep_i16_load_i32_align1(
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr inbounds i16, ptr @constarray1, i64 %idx
   %2 = load i32, ptr %1, align 1
   ret i32 %2
 }
 
-; TODO: this will be ret i32 65537(LE), 16777472(BE)
 define i32 @inbounds_gep_i32_load_i32_align8(i64 %idx){
-; CHECK-LABEL: @inbounds_gep_i32_load_i32_align8(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constarray1, i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 8
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; LE-LABEL: @inbounds_gep_i32_load_i32_align8(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @inbounds_gep_i32_load_i32_align8(
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr inbounds i32, ptr @constarray1, i64 %idx
   %2 = load i32, ptr %1, align 8
   ret i32 %2
 }
 
-; TODO: this will be ret i32 65547(LE), 16777472(BE)
 define i32 @inbounds_gep_i32_load_i32_const_offset(i64 %idx){
-; CHECK-LABEL: @inbounds_gep_i32_load_i32_const_offset(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr @constarray2, i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; LE-LABEL: @inbounds_gep_i32_load_i32_const_offset(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @inbounds_gep_i32_load_i32_const_offset(
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr inbounds i16, ptr @constarray2, i64 1
   %2 = getelementptr inbounds i32, ptr %1, i64 %idx
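The LE and BE check prefixes above differ only because of byte order: every sufficiently aligned 4-byte load from the repeating 01 00 byte pattern sees the same bytes, and only the order in which they are assembled into an i32 changes. A small standalone sketch (not part of the patch; the byte pattern is inferred from the expected constants rather than copied from the test's globals) reproducing both values:

#include <cstdint>
#include <cstdio>

int main() {
  // Bytes seen by any sufficiently aligned 4-byte load from the repeating
  // 01 00 pattern (inferred from the expected constants in the checks).
  const uint8_t Bytes[4] = {0x01, 0x00, 0x01, 0x00};

  uint32_t LE = 0, BE = 0;
  for (int I = 3; I >= 0; --I)
    LE = (LE << 8) | Bytes[I]; // little-endian: Bytes[0] is least significant
  for (int I = 0; I < 4; ++I)
    BE = (BE << 8) | Bytes[I]; // big-endian: Bytes[0] is most significant

  std::printf("LE = %u, BE = %u\n", LE, BE); // LE = 65537, BE = 16777472
  return 0;
}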
@@ -109,15 +111,16 @@
   ret i32 %3
 }
 
+; TODO: this should NOT be folded
 ; can't be folded because if gep is non-inbounds,
 ; the offsets are silently-wrapped with two’s complement arithmetic(mod 2**64).
 ; So the load operand can be a base pointer of constarray2.
 define i32 @gep_load_i32_align2_const_offset_wrap(i64 %idx){
-; CHECK-LABEL: @gep_load_i32_align2_const_offset_wrap(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i16, ptr @constarray2, i64 -2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [3 x i16], ptr [[TMP1]], i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 2
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; LE-LABEL: @gep_load_i32_align2_const_offset_wrap(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @gep_load_i32_align2_const_offset_wrap(
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr i16, ptr @constarray2, i64 -2
   %2 = getelementptr [3 x i16], ptr %1, i64 %idx
@@ -125,13 +128,9 @@
   ret i32 %3
 }
 
-; TODO: this will be ret i32 42
 define i32 @inbounds_gep_i32_load_i32_const_ptr_array(i64 %idx){
 ; CHECK-LABEL: @inbounds_gep_i32_load_i32_const_ptr_array(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ptr, ptr @constptrarray, i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; CHECK-NEXT:    ret i32 42
 ;
   %1 = getelementptr inbounds ptr, ptr @constptrarray, i64 %idx
   %2 = load ptr, ptr %1, align 4
@@ -163,16 +162,12 @@
   ret i32 %2
 }
 
-; TODO: this coould be folded into 65537(LE), 16777472(BE)
 define i32 @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(i64 %idx){
 ; LE-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(
 ; LE-NEXT:    ret i32 65537
 ;
 ; BE-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(
-; BE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr @conststruct, i64 1
-; BE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]]
-; BE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; BE-NEXT:    ret i32 [[TMP3]]
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr inbounds i16, ptr @conststruct, i64 1
   %2 = getelementptr inbounds i32, ptr %1, i64 %idx
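Regarding the new "TODO: this should NOT be folded" note on @gep_load_i32_align2_const_offset_wrap: with a non-inbounds GEP the byte offset is computed modulo 2**64, so the congruence "offset == 2 (mod 6)" derived from the constant offset -4 and the [3 x i16] stride of 6 no longer bounds where the load lands, and the pointer can wrap back to the base of @constarray2. A standalone sketch (not part of the patch; the index value is a worked illustration and does not appear in the test) of one such wrap:

#include <cstdint>
#include <cstdio>

int main() {
  // Non-inbounds gep chain from @gep_load_i32_align2_const_offset_wrap:
  // constant byte offset -4 (gep i16 ..., i64 -2) plus a variable index with
  // a [3 x i16] stride of 6, evaluated modulo 2**64.
  const uint64_t ConstOffset = static_cast<uint64_t>(-4);
  const uint64_t Stride = 6;

  // Illustrative index (not from the test): chosen so that 6 * Idx wraps to 4
  // modulo 2**64, i.e. Idx == 2 * inverse(3) (mod 2**63).
  const uint64_t Idx = 6148914691236517206ULL;

  const uint64_t Offset = ConstOffset + Stride * Idx; // wraps around zero
  std::printf("wrapped byte offset = %llu\n", (unsigned long long)Offset);
  // Prints 0: the pointer lands at the base of the global, outside the
  // "offset == 2 (mod 6)" set the fold reasons about, hence the TODO.
  return 0;
}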