Index: llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -306,7 +306,7 @@
   Value *MulOp0;
   // Matching "(i * 0x01010101...) >> 24".
   if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
-       match(Op1, m_SpecificInt(MaskShift))) {
+      match(Op1, m_SpecificInt(MaskShift))) {
     Value *ShiftOp0;
     // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
     if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
@@ -402,8 +402,8 @@
 /// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
 /// pessimistic codegen that has to account for setting errno and can enable
 /// vectorization.
-static bool
-foldSqrt(Instruction &I, TargetTransformInfo &TTI, TargetLibraryInfo &TLI) {
+static bool foldSqrt(Instruction &I, TargetTransformInfo &TTI,
+                     TargetLibraryInfo &TLI) {
   // Match a call to sqrt mathlib function.
   auto *Call = dyn_cast<CallInst>(&I);
   if (!Call)
@@ -825,6 +825,63 @@
   return true;
 }
 
+// Calculate the GEP stride and the accumulated constant ModOffset.
+static bool getStrideAndModOffsetOfGEP(Value *PtrOp, APInt &ModOffset,
+                                       std::optional<APInt> &Stride,
+                                       const DataLayout &DL) {
+
+  uint64_t BW = ModOffset.getBitWidth();
+  if (isa<GlobalVariable>(PtrOp)) {
+    ModOffset = APInt(BW, 0);
+    Stride = APInt(BW, 1);
+    return true;
+  }
+
+  // If PtrOp is neither a GlobalVariable nor a GEP, we cannot use the GEP
+  // stride, because the pointer might not arrive back at a GlobalVariable.
+  // Even so, we can still check by alignment.
+  auto *GEP = dyn_cast<GEPOperator>(PtrOp);
+  if (!GEP) {
+    Stride = std::nullopt;
+    return true;
+  }
+
+  Value *PtrOpV = GEP;
+  // Return a minimum gep stride, the greatest common divisor of consecutive
+  // gep index type sizes (c.f. Bézout's identity).
+  while ((GEP = dyn_cast<GEPOperator>(PtrOpV))) {
+    MapVector<Value *, APInt> VarOffsets;
+    if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset))
+      return false;
+
+    for (auto [V, IndexTypeSize] : VarOffsets) {
+      // TODO: handle non-inbounds wrapping
+      if (!Stride)
+        Stride = APInt(BW, IndexTypeSize.getZExtValue());
+      else
+        Stride = APIntOps::GreatestCommonDivisor(
+            *Stride, APInt(BW, IndexTypeSize.getZExtValue()));
+    }
+
+    PtrOpV = GEP->getPointerOperand();
+  }
+
+  // Check whether the pointer arrives back at a GlobalVariable.
+  // Even if it does not, we can still check by alignment.
+  if (!isa<GlobalVariable>(PtrOpV)) {
+    Stride = std::nullopt;
+    return true;
+  }
+
+  // In consideration of signed GEP indices, reduce the offset to a
+  // non-negative remainder of division by the minimum GEP stride.
+  ModOffset = ModOffset.srem(*Stride);
+  if (ModOffset.isNegative())
+    ModOffset += *Stride;
+
+  return true;
+}
+
 /// If C is a constant patterned array and all valid loaded results for given
 /// alignment are same to a constant, return that constant.
 static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
@@ -847,16 +904,11 @@
   if (!GVSize || 4096 < GVSize)
     return false;
 
-  // Check whether pointer arrives back at Global Variable.
-  // If PtrOp is neither GlobalVariable nor GEP, it might not arrive back at
-  // GlobalVariable.
-  // TODO: implement GEP handling
   unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
   std::optional<APInt> Stride;
   APInt ConstOffset = APInt(BW, 0);
-  if (isa<GlobalVariable>(PtrOp)) {
-    Stride = APInt(BW, 1);
-  }
+  if (!getStrideAndModOffsetOfGEP(PtrOp, ConstOffset, Stride, DL))
+    return false;
 
   // Any possible offset could be multiple of GEP stride. And any valid
   // offset is multiple of load alignment, so checking only multiples of bigger
@@ -864,6 +916,7 @@
   if (auto LA = LI->getAlign();
       LA.value() <= GV->getAlign().valueOrOne().value() &&
      (!Stride || Stride->getZExtValue() < LA.value())) {
+    ConstOffset = APInt(BW, 0);
     Stride = APInt(BW, LA.value());
   }
 
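The new helper rests on a simple congruence argument: every variable GEP index contributes a multiple of its type-store size to the byte offset, so by Bézout's identity any reachable offset is congruent to the accumulated constant offset modulo the GCD of those sizes. Below is a minimal standalone C++ sketch of that arithmetic (not part of the patch; plain int64_t stands in for APInt and the sample values are only illustrative), using the shape of the @inbounds_gep_i32_load_i32_const_offset test: one variable i32 index (scale 4) on top of a constant byte offset of 2.

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Byte-offset contributions of the GEP chain: one variable i32 index
  // (scale 4) stacked on a constant "gep i16 ..., i64 1" (constant offset 2).
  std::vector<int64_t> Scales = {4};
  int64_t ConstOffset = 2;

  // Any reachable offset is ConstOffset + sum(k_i * Scale_i), which by
  // Bezout's identity is congruent to ConstOffset modulo gcd of the scales.
  int64_t Stride = 0; // 0 plays the role of the "not yet set" optional
  for (int64_t S : Scales)
    Stride = std::gcd(Stride, S);

  // Reduce to a non-negative remainder, as the helper does with srem plus a
  // conditional add for signed (possibly negative) constant offsets.
  int64_t ModOffset = ((ConstOffset % Stride) + Stride) % Stride;

  std::printf("Stride = %lld, ModOffset = %lld\n", (long long)Stride,
              (long long)ModOffset); // Stride = 4, ModOffset = 2
  return 0;
}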
Index: llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
===================================================================
--- llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
+++ llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
@@ -50,7 +50,10 @@
 ; can't be folded because ptrmask can change ptr, while preserving provenance
 define i8 @inbounds_gep_load_i8_align2_ptrmasked(i64 %idx, i64 %mask){
 ; CHECK-LABEL: @inbounds_gep_load_i8_align2_ptrmasked(
-; CHECK-NEXT:    ret i8 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 2
+; CHECK-NEXT:    ret i8 [[TMP3]]
 ;
   %1 = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 %mask)
   %2 = getelementptr inbounds i8, ptr %1, i64 %idx
@@ -58,37 +61,36 @@
   ret i8 %3
 }
 
-; TODO: this will be ret i32 65537(LE), 16777472(BE)
 define i32 @inbounds_gep_i16_load_i32_align1(i64 %idx){
-; CHECK-LABEL: @inbounds_gep_i16_load_i32_align1(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr @constarray1, i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; LE-LABEL: @inbounds_gep_i16_load_i32_align1(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @inbounds_gep_i16_load_i32_align1(
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr inbounds i16, ptr @constarray1, i64 %idx
   %2 = load i32, ptr %1, align 1
   ret i32 %2
 }
 
-; TODO: this will be ret i32 65537(LE), 16777472(BE)
 define i32 @inbounds_gep_i32_load_i32_align8(i64 %idx){
-; CHECK-LABEL: @inbounds_gep_i32_load_i32_align8(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constarray1, i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 8
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; LE-LABEL: @inbounds_gep_i32_load_i32_align8(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @inbounds_gep_i32_load_i32_align8(
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr inbounds i32, ptr @constarray1, i64 %idx
   %2 = load i32, ptr %1, align 8
   ret i32 %2
 }
 
-; TODO: this will be ret i32 65547(LE), 16777472(BE)
 define i32 @inbounds_gep_i32_load_i32_const_offset(i64 %idx){
-; CHECK-LABEL: @inbounds_gep_i32_load_i32_const_offset(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr @constarray2, i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; LE-LABEL: @inbounds_gep_i32_load_i32_const_offset(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @inbounds_gep_i32_load_i32_const_offset(
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr inbounds i16, ptr @constarray2, i64 1
   %2 = getelementptr inbounds i32, ptr %1, i64 %idx
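The LE and BE check prefixes above differ only because of byte order: every sufficiently aligned 4-byte load from the repeating 01 00 byte pattern sees the same bytes, and only the order in which they are assembled into an i32 changes. A small standalone sketch (not part of the patch; the byte pattern is inferred from the expected constants rather than copied from the test's globals) reproducing both values:

#include <cstdint>
#include <cstdio>

int main() {
  // Bytes seen by any sufficiently aligned 4-byte load from the repeating
  // 01 00 pattern (inferred from the expected constants in the checks).
  const uint8_t Bytes[4] = {0x01, 0x00, 0x01, 0x00};

  uint32_t LE = 0, BE = 0;
  for (int I = 3; I >= 0; --I)
    LE = (LE << 8) | Bytes[I]; // little-endian: Bytes[0] is least significant
  for (int I = 0; I < 4; ++I)
    BE = (BE << 8) | Bytes[I]; // big-endian: Bytes[0] is most significant

  std::printf("LE = %u, BE = %u\n", LE, BE); // LE = 65537, BE = 16777472
  return 0;
}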
@@ -109,15 +111,16 @@
   ret i32 %3
 }
 
+; TODO: this should NOT be folded
 ; can't be folded because if gep is non-inbounds,
 ; the offsets are silently-wrapped with two’s complement arithmetic(mod 2**64).
 ; So the load operand can be a base pointer of constarray2.
 define i32 @gep_load_i32_align2_const_offset_wrap(i64 %idx){
-; CHECK-LABEL: @gep_load_i32_align2_const_offset_wrap(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i16, ptr @constarray2, i64 -2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [3 x i16], ptr [[TMP1]], i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 2
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; LE-LABEL: @gep_load_i32_align2_const_offset_wrap(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @gep_load_i32_align2_const_offset_wrap(
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr i16, ptr @constarray2, i64 -2
   %2 = getelementptr [3 x i16], ptr %1, i64 %idx
@@ -125,13 +128,9 @@
   ret i32 %3
 }
 
-; TODO: this will be ret i32 42
 define i32 @inbounds_gep_i32_load_i32_const_ptr_array(i64 %idx){
 ; CHECK-LABEL: @inbounds_gep_i32_load_i32_const_ptr_array(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ptr, ptr @constptrarray, i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; CHECK-NEXT:    ret i32 42
 ;
   %1 = getelementptr inbounds ptr, ptr @constptrarray, i64 %idx
   %2 = load ptr, ptr %1, align 4
@@ -163,16 +162,12 @@
   ret i32 %2
 }
 
-; TODO: this coould be folded into 65537(LE), 16777472(BE)
 define i32 @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(i64 %idx){
 ; LE-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(
 ; LE-NEXT:    ret i32 65537
 ;
 ; BE-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(
-; BE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr @conststruct, i64 1
-; BE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]]
-; BE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; BE-NEXT:    ret i32 [[TMP3]]
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr inbounds i16, ptr @conststruct, i64 1
   %2 = getelementptr inbounds i32, ptr %1, i64 %idx
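Regarding the new "TODO: this should NOT be folded" note on @gep_load_i32_align2_const_offset_wrap: with a non-inbounds GEP the byte offset is computed modulo 2**64, so the congruence "offset == 2 (mod 6)" derived from the constant offset -4 and the [3 x i16] stride of 6 no longer bounds where the load lands, and the pointer can wrap back to the base of @constarray2. A standalone sketch (not part of the patch; the index value is a worked illustration and does not appear in the test) of one such wrap:

#include <cstdint>
#include <cstdio>

int main() {
  // Non-inbounds gep chain from @gep_load_i32_align2_const_offset_wrap:
  // constant byte offset -4 (gep i16 ..., i64 -2) plus a variable index with
  // a [3 x i16] stride of 6, evaluated modulo 2**64.
  const uint64_t ConstOffset = static_cast<uint64_t>(-4);
  const uint64_t Stride = 6;

  // Illustrative index (not from the test): chosen so that 6 * Idx wraps to 4
  // modulo 2**64, i.e. Idx == 2 * inverse(3) (mod 2**63).
  const uint64_t Idx = 6148914691236517206ULL;

  const uint64_t Offset = ConstOffset + Stride * Idx; // wraps around zero
  std::printf("wrapped byte offset = %llu\n", (unsigned long long)Offset);
  // Prints 0: the pointer lands at the base of the global, outside the
  // "offset == 2 (mod 6)" set the fold reasons about, hence the TODO.
  return 0;
}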