Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -354,6 +354,148 @@
   return nullptr;
 }
 
+struct PointerOffsetPair {
+  Value *Pointer;
+  uint64_t Offset;
+};
+
+static PointerOffsetPair getPointerOffsetPair(const DataLayout *DL,
+                                              LoadInst &LI) {
+  PointerOffsetPair POP;
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(LI.getPointerOperand())) {
+    POP.Pointer = GEP->getPointerOperand();
+    unsigned BitWidth = DL->getPointerTypeSizeInBits(GEP->getType());
+    APInt Offset(BitWidth, 0);
+    if (GEP->accumulateConstantOffset(*DL, Offset)) {
+      POP.Offset = Offset.getZExtValue();
+      return POP;
+    }
+    // Can't handle GEPs with variable indices.
+    POP.Pointer = nullptr;
+    return POP;
+  }
+  POP.Pointer = LI.getPointerOperand();
+  POP.Offset = 0;
+  return POP;
+}
+
+/// \brief Are these two loads from the same pointer with different constant
+/// offsets?
+static bool areMaybeCombinable(const DataLayout *DL, LoadInst &A, LoadInst &B) {
+  if (A.getAlignment() != B.getAlignment() || A.getType() != B.getType())
+    return false;
+  auto PA = getPointerOffsetPair(DL, A), PB = getPointerOffsetPair(DL, B);
+  if (PA.Pointer == PB.Pointer && PA.Offset != PB.Offset)
+    return true;
+  return false;
+}
+
+static Value *extractInteger(const DataLayout &DL,
+                             InstCombiner::BuilderTy &IRB, Value *V,
+                             IntegerType *Ty, uint64_t Offset,
+                             const Twine &Name) {
+  DEBUG(dbgs() << "       start: " << *V << "\n");
+  IntegerType *IntTy = cast<IntegerType>(V->getType());
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element extends past full value");
+  uint64_t ShAmt = 8 * Offset;
+  if (DL.isBigEndian())
+    ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+  if (ShAmt) {
+    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+    DEBUG(dbgs() << "     shifted: " << *V << "\n");
+  }
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot extract to a larger integer!");
+  if (Ty != IntTy) {
+    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+    DEBUG(dbgs() << "     trunced: " << *V << "\n");
+  }
+  return V;
+}
+
+/// \brief Scan forward to find all loads that can be combined into the
+/// current load, stopping at unsafe instructions.
+static void instCombineLoadCombine(InstCombiner &IC, LoadInst &LI,
+                                   const DataLayout *DL) {
+  if (!DL)
+    return;
+  if (!LI.isSimple())
+    return;
+
+  struct LoadPOPPair {
+    LoadPOPPair(LoadInst *L, PointerOffsetPair P) : Load(L), POP(P) {}
+    LoadInst *Load;
+    PointerOffsetPair POP;
+    bool operator<(LoadPOPPair Other) const {
+      return POP.Offset < Other.POP.Offset;
+    }
+  };
+
+  SmallVector<LoadPOPPair, 8> Loads;
+  Loads.push_back(LoadPOPPair(&LI, getPointerOffsetPair(DL, LI)));
+
+  BasicBlock::iterator BBI = &LI;
+  unsigned RangeLimit = 10;
+  for (auto BBE = LI.getParent()->end(); BBI != BBE && RangeLimit;
+       ++BBI, --RangeLimit) {
+    if (isa<StoreInst>(BBI) || isa<CallInst>(BBI))
+      break;
+    if (LoadInst *NextLoad = dyn_cast<LoadInst>(BBI)) {
+      if (!NextLoad->isSimple())
+        continue;
+      if (!areMaybeCombinable(DL, LI, *NextLoad))
+        continue;
+      Loads.push_back(
+          LoadPOPPair(NextLoad, getPointerOffsetPair(DL, *NextLoad)));
+      RangeLimit = 10;
+    }
+  }
+  std::sort(Loads.begin(), Loads.end());
+
+  // Handle the REALLY simple case for now.
+  if (Loads.size() < 2 || !isPowerOf2_64(Loads.size()) ||
+      !LI.getType()->isIntegerTy(8))
+    return;
+
+  // Make sure the offsets are consecutive.
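+  // The loads are all i8 at this point, so combining is only legal when each
+  // sorted offset is exactly one byte past its predecessor; bail on any gap.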
+  PointerOffsetPair POP;
+  POP.Offset = -1ull;
+  for (const auto &L : Loads) {
+    if (POP.Offset == -1ull) {
+      POP = L.POP;
+      continue;
+    }
+    if (L.POP.Offset != POP.Offset + 1)
+      return;
+    POP = L.POP;
+  }
+
+  DEBUG(dbgs() << "***** Combining Loads *****\n");
+  DEBUG({
+    for (const auto &L : Loads) {
+      dbgs() << L.POP.Offset << ": ";
+      L.Load->dump();
+    }
+  });
+
+  unsigned LoadSize = Loads.size() * LI.getType()->getIntegerBitWidth();
+  Value *Ptr = Loads[0].Load->getPointerOperand();
+  LoadInst *NewLoad = new LoadInst(
+      CastInst::CreatePointerCast(
+          Ptr, PointerType::get(IntegerType::get(LI.getContext(), LoadSize),
+                                Ptr->getType()->getPointerAddressSpace()),
+          "", &LI),
+      Twine(LI.getName()) + ".combined", false, 1, &LI);
+
+  for (const auto &L : Loads) {
+    Value *V = extractInteger(
+        *DL, *IC.Builder, NewLoad, cast<IntegerType>(LI.getType()),
+        L.POP.Offset - Loads[0].POP.Offset, "combine.extract");
+    IC.Worklist.AddValue(V);
+    L.Load->replaceAllUsesWith(V);
+  }
+}
+
 Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
   Value *Op = LI.getOperand(0);
 
@@ -460,6 +602,7 @@
       }
     }
   }
+  instCombineLoadCombine(*this, LI, DL);
 
   return nullptr;
 }
Index: test/Transforms/InstCombine/load-combine.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/load-combine.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i64 @LoadU64_x64_0(i64* nocapture readonly %pData) {
+  %1 = bitcast i64* %pData to i8*
+  %2 = load i8* %1, align 1
+  %3 = zext i8 %2 to i64
+  %4 = shl nuw i64 %3, 56
+  %5 = getelementptr inbounds i8* %1, i64 1
+  %6 = load i8* %5, align 1
+  %7 = zext i8 %6 to i64
+  %8 = shl nuw nsw i64 %7, 48
+  %9 = or i64 %8, %4
+  %10 = getelementptr inbounds i8* %1, i64 2
+  %11 = load i8* %10, align 1
+  %12 = zext i8 %11 to i64
+  %13 = shl nuw nsw i64 %12, 40
+  %14 = or i64 %9, %13
+  %15 = getelementptr inbounds i8* %1, i64 3
+  %16 = load i8* %15, align 1
+  %17 = zext i8 %16 to i64
+  %18 = shl nuw nsw i64 %17, 32
+  %19 = or i64 %14, %18
+  %20 = getelementptr inbounds i8* %1, i64 4
+  %21 = load i8* %20, align 1
+  %22 = zext i8 %21 to i64
+  %23 = shl nuw nsw i64 %22, 24
+  %24 = or i64 %19, %23
+  %25 = getelementptr inbounds i8* %1, i64 5
+  %26 = load i8* %25, align 1
+  %27 = zext i8 %26 to i64
+  %28 = shl nuw nsw i64 %27, 16
+  %29 = or i64 %24, %28
+  %30 = getelementptr inbounds i8* %1, i64 6
+  %31 = load i8* %30, align 1
+  %32 = zext i8 %31 to i64
+  %33 = shl nuw nsw i64 %32, 8
+  %34 = or i64 %29, %33
+  %35 = getelementptr inbounds i8* %1, i64 7
+  %36 = load i8* %35, align 1
+  %37 = zext i8 %36 to i64
+  %38 = or i64 %34, %37
+  ret i64 %38
+; CHECK: load i64*
+; CHECK-NOT: load
+}