diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -305,7 +306,7 @@
   Value *MulOp0;
   // Matching "(i * 0x01010101...) >> 24".
   if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
-       match(Op1, m_SpecificInt(MaskShift))) {
+      match(Op1, m_SpecificInt(MaskShift))) {
     Value *ShiftOp0;
     // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
     if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
@@ -401,8 +402,8 @@
 /// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
 /// pessimistic codegen that has to account for setting errno and can enable
 /// vectorization.
-static bool
-foldSqrt(Instruction &I, TargetTransformInfo &TTI, TargetLibraryInfo &TLI) {
+static bool foldSqrt(Instruction &I, TargetTransformInfo &TTI,
+                     TargetLibraryInfo &TLI) {
   // Match a call to sqrt mathlib function.
   auto *Call = dyn_cast<CallInst>(&I);
   if (!Call)
@@ -824,6 +825,58 @@
   return true;
 }

+/// If C is a constant patterned array and all valid loads for the given
+/// alignment yield the same constant, fold the load to that constant.
+static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
+  auto *LI = dyn_cast<LoadInst>(&I);
+  if (!LI || LI->isVolatile())
+    return false;
+
+  // We can only fold the load if it is from a constant global with definitive
+  // initializer. Skip expensive logic if this is not the case.
+  auto *PtrOp = LI->getPointerOperand();
+  auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(PtrOp));
+  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
+    return false;
+
+  Type *LoadTy = LI->getType();
+  Constant *C = GV->getInitializer();
+
+  // Bail for large initializers in excess of 4K to avoid too many scans.
+  uint64_t GVSize = DL.getTypeAllocSize(C->getType());
+  if (!GVSize || 4096 < GVSize)
+    return false;
+
+  // Check whether the pointer arrives back at the global variable. If PtrOp
+  // is neither a GlobalVariable nor a GEP, it might not arrive back at the
+  // global variable.
+  // TODO: implement GEP handling
+  unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
+  // TODO: Determine stride based on GEPs.
+  APInt Stride(BW, 1);
+  APInt ConstOffset(BW, 0);
+
+  // Any possible offset is a multiple of the GEP stride, and any valid offset
+  // is a multiple of the load alignment, so checking only multiples of the
+  // larger of the two is sufficient to prove the loaded results are equal.
+  if (auto LA = LI->getAlign();
+      LA <= GV->getAlign().valueOrOne() && Stride.getZExtValue() < LA.value())
+    Stride = APInt(BW, LA.value());
+
+  Constant *Ca = ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL);
+  if (!Ca)
+    return false;
+
+  unsigned E = GVSize - DL.getTypeStoreSize(LoadTy);
+  for (; ConstOffset.getZExtValue() <= E; ConstOffset += Stride)
+    if (Ca != ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL))
+      return false;
+
+  I.replaceAllUsesWith(Ca);
+
+  return true;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
@@ -850,6 +903,7 @@
       MadeChange |= tryToFPToSat(I, TTI);
       MadeChange |= tryToRecognizeTableBasedCttz(I);
       MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA);
+      MadeChange |= foldPatternedLoads(I, DL);
       // NOTE: This function introduces erasing of the instruction `I`, so it
       // needs to be called at the end of this sequence, otherwise we may make
       // bugs.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll b/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
@@ -12,12 +12,9 @@
 @constpackedstruct = internal constant <{[8 x i8]}> <{[8 x i8] c"\01\00\01\00\01\00\01\00"}>, align 4
 @conststruct = internal constant {i16, [8 x i8]} {i16 1, [8 x i8] c"\01\00\01\00\01\00\01\00"}, align 4

-; TODO: this will be ret i8 1
 define i8 @inbounds_gep_load_i8_align2(i64 %idx){
 ; CHECK-LABEL: @inbounds_gep_load_i8_align2(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray1, i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 2
-; CHECK-NEXT:    ret i8 [[TMP2]]
+; CHECK-NEXT:    ret i8 1
 ;
   %1 = getelementptr inbounds i8, ptr @constarray1, i64 %idx
   %2 = load i8, ptr %1, align 2
@@ -53,10 +50,7 @@
 ; can't be folded because ptrmask can change ptr, while preserving provenance
 define i8 @inbounds_gep_load_i8_align2_ptrmasked(i64 %idx, i64 %mask){
 ; CHECK-LABEL: @inbounds_gep_load_i8_align2_ptrmasked(
-; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 2
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    ret i8 1
 ;
   %1 = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 %mask)
   %2 = getelementptr inbounds i8, ptr %1, i64 %idx
@@ -102,13 +96,12 @@
   ret i32 %3
 }

-; TODO: this coould be folded into 65537(LE), 16777472(BE)
 define i32 @gep_load_i32_align2_const_offset(i64 %idx){
-; CHECK-LABEL: @gep_load_i32_align2_const_offset(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i16, ptr @constarray1, i64 -2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [3 x i16], ptr [[TMP1]], i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 2
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; LE-LABEL: @gep_load_i32_align2_const_offset(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @gep_load_i32_align2_const_offset(
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr i16, ptr @constarray1, i64 -2
   %2 = getelementptr [3 x i16], ptr %1, i64 %idx
@@ -146,12 +139,12 @@
   ret i32 %3
 }

-; TODO: this coould be folded into 65537(LE), 16777472(BE)
 define i32 @inbounds_gep_i32_load_i32_align4_packedstruct(i64 %idx){
-; CHECK-LABEL: @inbounds_gep_i32_load_i32_align4_packedstruct(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constpackedstruct, i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; LE-LABEL: @inbounds_gep_i32_load_i32_align4_packedstruct(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @inbounds_gep_i32_load_i32_align4_packedstruct(
+; BE-NEXT:    ret i32 16777472
 ;
   %1 = getelementptr inbounds i32, ptr @constpackedstruct, i64 %idx
   %2 = load i32, ptr %1, align 4
@@ -172,11 +165,14 @@

 ; TODO: this coould be folded into 65537(LE), 16777472(BE)
 define i32 @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(i64 %idx){
-; CHECK-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr @conststruct, i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; LE-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(
+; LE-NEXT:    ret i32 65537
+;
+; BE-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(
+; BE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr @conststruct, i64 1
+; BE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]]
+; BE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+; BE-NEXT:    ret i32 [[TMP3]]
 ;
   %1 = getelementptr inbounds i16, ptr @conststruct, i64 1
   %2 = getelementptr inbounds i32, ptr %1, i64 %idx
@@ -184,6 +180,3 @@
   ret i32 %3
 }

-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; BE: {{.*}}
-; LE: {{.*}}
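
For reference, here is a minimal standalone C++ sketch (not part of the patch) of why every sufficiently aligned i32 load from the repeating 01 00 01 00 byte pattern used by @constarray1 and @constpackedstruct folds to 65537 on little-endian targets and 16777472 on big-endian targets, matching the LE/BE check lines above:

#include <cstdint>
#include <cstdio>

int main() {
  // Four consecutive bytes of the repeating pattern c"\01\00\01\00...".
  // Every in-bounds offset that is a multiple of the load alignment (2 or 4)
  // reads these same four bytes, which is what foldPatternedLoads verifies.
  const uint8_t Bytes[4] = {0x01, 0x00, 0x01, 0x00};

  // Little-endian interpretation: 0x00010001 == 65537.
  uint32_t LE = (uint32_t)Bytes[0] | ((uint32_t)Bytes[1] << 8) |
                ((uint32_t)Bytes[2] << 16) | ((uint32_t)Bytes[3] << 24);

  // Big-endian interpretation: 0x01000100 == 16777472.
  uint32_t BE = ((uint32_t)Bytes[0] << 24) | ((uint32_t)Bytes[1] << 16) |
                ((uint32_t)Bytes[2] << 8) | (uint32_t)Bytes[3];

  printf("LE: %u BE: %u\n", LE, BE); // prints "LE: 65537 BE: 16777472"
  return 0;
}

foldPatternedLoads proves this by re-folding the constant at every offset that is a multiple of the chosen stride (the load alignment when it does not exceed the global's alignment). The BE side of the struct_with_const_offset test keeps its load because for @conststruct the bytes at offset 0 (00 01 01 00) differ from those at offset 4 (01 00 01 00), while on LE both offsets read 01 00 01 00.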