Index: llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp =================================================================== --- llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -824,6 +825,83 @@ return true; } +/// If C is a constant patterned array and all valid loaded results for given +/// alignment are same to a constant, return that constant. +static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) { + + auto *LI = dyn_cast(&I); + if (!LI || LI->isVolatile()) + return false; + + // We can only fold the load if it is from a constant global with definitive + // initializer. Skip expensive logic if this is not the case. + auto *PtrOp = LI->getOperand(0); + auto *GV = dyn_cast(getUnderlyingObject(PtrOp)); + if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer()) + return false; + + Type *LoadTy = LI->getType(); + Constant *C = GV->getInitializer(); + + // Bail for large initializers in excess of 4K to avoid allocating + // too much memory. + uint64_t GVSize = DL.getTypeAllocSize(C->getType()); + if (!GVSize || 4096 < GVSize) + return false; + + // Calculate constant offset and minimum GEP stride + unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType()); + APInt ConstOffset(BW, 0); + // TODO: make Stride optional APInt + uint64_t Stride = 1; + if (auto GEP = dyn_cast(PtrOp)) { + Stride = DL.getTypeAllocSize(GEP->getSourceElementType()); + Value *PtrOpV = GEP; + + // Return a minimum gep stride, greatest common divisor of consective gep + // indices type sizes (c.f. Bézout's identity). + while (auto GEP = dyn_cast(PtrOpV)) { + MapVector VarOffsets; + if (!GEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) + return false; + + for (auto [_, IndexTypeSize] : VarOffsets) + Stride = APIntOps::GreatestCommonDivisor( + APInt(BW, Stride), APInt(BW, IndexTypeSize.getZExtValue())) + .getZExtValue(); + + PtrOpV = GEP->getPointerOperand(); + } + + // In consideration of signed GEP indices, non-negligible offset become + // remainder of divission by minimum GEP stride. + ConstOffset = ConstOffset.srem(APInt(BW, Stride)); + } + + // Any possible offset could be multiple of minimum GEP stride. And any valid + // offset is multiple of load alignment, so checking onle multiples of bigger + // one is sufficient to say results' equality. + if (auto LA = LI->getAlign(); + LA.value() <= GV->getAlign().valueOrOne().value() && + Stride < LA.value()) { + ConstOffset = APInt(BW, 0); + Stride = LA.value(); + } + + Constant *Ca = ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL); + if (!Ca) + return false; + + unsigned E = GVSize - DL.getTypeAllocSize(LoadTy); + for (; ConstOffset.getZExtValue() <= E; ConstOffset += APInt(BW, Stride)) + if (Ca != ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL)) + return false; + + I.replaceAllUsesWith(Ca); + + return true; +} + /// This is the entry point for folds that could be implemented in regular /// InstCombine, but they are separated because they are not expected to /// occur frequently and/or have more than a constant-length pattern match. @@ -850,6 +928,7 @@ MadeChange |= tryToFPToSat(I, TTI); MadeChange |= tryToRecognizeTableBasedCttz(I); MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA); + MadeChange |= foldPatternedLoads(I, DL); // NOTE: This function introduces erasing of the instruction `I`, so it // needs to be called at the end of this sequence, otherwise we may make // bugs. Index: llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll =================================================================== --- llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll +++ llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll @@ -14,9 +14,7 @@ define i8 @gep_load_i8_align2(i64 %idx){ ; CHECK-LABEL: @gep_load_i8_align2( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray1, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 2 -; CHECK-NEXT: ret i8 [[TMP2]] +; CHECK-NEXT: ret i8 1 ; %1 = getelementptr inbounds i8, ptr @constarray1, i64 %idx %2 = load i8, ptr %1, align 2 @@ -48,10 +46,11 @@ } define i32 @gep_i16_load_i32_align1(i64 %idx){ -; CHECK-LABEL: @gep_i16_load_i32_align1( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @constarray1, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1 -; CHECK-NEXT: ret i32 [[TMP2]] +; LE-LABEL: @gep_i16_load_i32_align1( +; LE-NEXT: ret i32 65537 +; +; BE-LABEL: @gep_i16_load_i32_align1( +; BE-NEXT: ret i32 16777472 ; %1 = getelementptr inbounds i16, ptr @constarray1, i64 %idx %2 = load i32, ptr %1, align 1 @@ -59,10 +58,11 @@ } define i32 @gep_i32_load_i32_align8(i64 %idx){ -; CHECK-LABEL: @gep_i32_load_i32_align8( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constarray1, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 8 -; CHECK-NEXT: ret i32 [[TMP2]] +; LE-LABEL: @gep_i32_load_i32_align8( +; LE-NEXT: ret i32 65537 +; +; BE-LABEL: @gep_i32_load_i32_align8( +; BE-NEXT: ret i32 16777472 ; %1 = getelementptr inbounds i32, ptr @constarray1, i64 %idx %2 = load i32, ptr %1, align 8 @@ -70,11 +70,11 @@ } define i32 @gep_i32_load_i32_with_const_offset(i64 %idx){ -; CHECK-LABEL: @gep_i32_load_i32_with_const_offset( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @constarray2, i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 -; CHECK-NEXT: ret i32 [[TMP3]] +; LE-LABEL: @gep_i32_load_i32_with_const_offset( +; LE-NEXT: ret i32 65537 +; +; BE-LABEL: @gep_i32_load_i32_with_const_offset( +; BE-NEXT: ret i32 16777472 ; %1 = getelementptr inbounds i16, ptr @constarray2, i64 1 %2 = getelementptr inbounds i32, ptr %1, i64 %idx @@ -84,10 +84,7 @@ define i32 @gep_i32_load_i32_const_ptr_array(i64 %idx){ ; CHECK-LABEL: @gep_i32_load_i32_const_ptr_array( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr @constptrarray, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 -; CHECK-NEXT: ret i32 [[TMP3]] +; CHECK-NEXT: ret i32 42 ; %1 = getelementptr inbounds ptr, ptr @constptrarray, i64 %idx %2 = load ptr, ptr %1, align 4 @@ -96,10 +93,11 @@ } define i32 @gep_i32_load_i32_align4_packedstruct(i64 %idx){ -; CHECK-LABEL: @gep_i32_load_i32_align4_packedstruct( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constpackedstruct, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: ret i32 [[TMP2]] +; LE-LABEL: @gep_i32_load_i32_align4_packedstruct( +; LE-NEXT: ret i32 65537 +; +; BE-LABEL: @gep_i32_load_i32_align4_packedstruct( +; BE-NEXT: ret i32 16777472 ; %1 = getelementptr inbounds i32, ptr @constpackedstruct, i64 %idx %2 = load i32, ptr %1, align 4 @@ -120,17 +118,14 @@ } define i32 @gep_i32_load_i32_align4_struct_with_const_offset(i64 %idx){ -; CHECK-LABEL: @gep_i32_load_i32_align4_struct_with_const_offset( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @conststruct, i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 -; CHECK-NEXT: ret i32 [[TMP3]] +; LE-LABEL: @gep_i32_load_i32_align4_struct_with_const_offset( +; LE-NEXT: ret i32 65537 +; +; BE-LABEL: @gep_i32_load_i32_align4_struct_with_const_offset( +; BE-NEXT: ret i32 16777472 ; %1 = getelementptr inbounds i16, ptr @conststruct, i64 1 %2 = getelementptr inbounds i32, ptr %1, i64 %idx %3 = load i32, ptr %2, align 4 ret i32 %3 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; BE: {{.*}} -; LE: {{.*}}