Index: llvm/lib/Target/WebAssembly/CMakeLists.txt =================================================================== --- llvm/lib/Target/WebAssembly/CMakeLists.txt +++ llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -40,6 +40,7 @@ WebAssemblyMCInstLower.cpp WebAssemblyMCLowerPrePass.cpp WebAssemblyNullifyDebugValueLists.cpp + WebAssemblyOptimizeGEPs.cpp WebAssemblyOptimizeLiveIntervals.cpp WebAssemblyOptimizeReturned.cpp WebAssemblyPeephole.cpp Index: llvm/lib/Target/WebAssembly/WebAssembly.h =================================================================== --- llvm/lib/Target/WebAssembly/WebAssembly.h +++ llvm/lib/Target/WebAssembly/WebAssembly.h @@ -23,6 +23,7 @@ class WebAssemblyTargetMachine; class ModulePass; class FunctionPass; +class Pass; // LLVM IR passes. ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(); @@ -30,6 +31,7 @@ ModulePass *createWebAssemblyFixFunctionBitcasts(); FunctionPass *createWebAssemblyOptimizeReturned(); FunctionPass *createWebAssemblyLowerRefTypesIntPtrConv(); +Pass *createWebAssemblyOptimizeGEPs(); // ISel and immediate followup passes. FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM, @@ -82,6 +84,7 @@ void initializeWebAssemblyPeepholePass(PassRegistry &); void initializeWebAssemblyMCLowerPrePassPass(PassRegistry &); void initializeWebAssemblyLowerRefTypesIntPtrConvPass(PassRegistry &); +void initializeWebAssemblyOptimizeGEPsPass(PassRegistry &); namespace WebAssembly { enum TargetIndex { Index: llvm/lib/Target/WebAssembly/WebAssemblyOptimizeGEPs.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/WebAssembly/WebAssemblyOptimizeGEPs.cpp @@ -0,0 +1,237 @@ +//===--- WebAssemblyOptimizeGEPs.cpp - GetElementPtr index processing ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Optimize inbounds GetElementPtrInst so that LoopStrengthReduce and
+/// ScalarEvolutionExpander don't make modifications to the IR which lose the
+/// inbounds information.
+///
+/// To use immediate address offsets, the add operations need to be marked as
+/// nuw due to WebAssembly's infinite precision address offset calculation. The
+/// add operations are lowered from GetElementPtrInst and, if it's inbounds, the
+/// add will be nuw. Unfortunately, the inbounds information is easily lost when
+/// converting between IR -> SCEV -> IR during LSR. This pass finds inbounds
+/// GEPs, with a single immediate index and converts the base pointer to use
+/// pointer arithmetic. It's this arithmetic that prevents SCEVExpander from
+/// losing the nowrap / inbounds attributes.
+///
+/// SeparateConstOffsetFromGEP and EarlyCSE are used to prepare the code for
+/// this pass. 
+/// +/// As an example, a common pattern in an unrolled loop would be an or of an +/// indvar to generate an index for gep, eg: +/// +/// %index = or i32 %i, 1 +/// %gep = getelementptr inbounds i32, ptr %base, i32 %index +/// %data = load i32, ptr %gep, align 4 +/// +/// However, after LoopStrengthReduce the code is usually transformed into the +/// form which is less friendly for WebAssembly as we cannot lower non-inbound +/// geps to load/store with immediate offsets: +/// +/// %uglygep = getelementptr i8, ptr %base, i32 %lsr.iv +/// %uglygep2 = getelementptr i8, ptr %uglygep, i32 4 +/// %data = load i32, ptr %uglygep2, align 4 +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "wasm-optimize-geps" +#define PASS_DESC "Optimize inner-loop GEPs for Webassembly" + +using namespace llvm; + +static cl::opt DisableWebAssemblyOptimizeGEPs( + "disable-wasm-optimize-geps", cl::Hidden, + cl::desc("WebAssembly: Disable getelementptr optimizations."), + cl::init(false)); + +namespace { + +class RebaseCandidate { +public: + RebaseCandidate(GetElementPtrInst *GEP) : GEP(GEP) {} + + void update(Value *NewBasePtr) { + assert(NewBasePtr->getType()->isPointerTy() && "Expected PointerTy"); + GEP->setOperand(0, NewBasePtr); + } + + GetElementPtrInst *getGEP() const { return GEP; } + +#ifndef NDEBUG + void dump() { + LLVM_DEBUG(dbgs() << "Candidate:\n" + << " GEP: " << *GEP << "\n" + << " Base: " << *GEP->getPointerOperand() << "\n"); + } +#endif + + // - inbounds + // - gep base ptr + // - single, 
constant, index + static bool isCandidate(GetElementPtrInst *GEP) { + return GEP && GEP->isInBounds() && GEP->getNumIndices() == 1 && + isa(GEP->getOperand(0)) && + isa(GEP->getOperand(1)); + } + +private: + GetElementPtrInst *GEP; +}; + +class WebAssemblyOptimizeGEPs : public LoopPass { +public: + static char ID; + + WebAssemblyOptimizeGEPs() : LoopPass(ID) {} + +private: + StringRef getPassName() const override { return PASS_DESC; } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + bool run(Loop *L); + void addCandidate(GetElementPtrInst *GEP); + // Create a new base address using pointer casting. This should prevent LSR + // from messing with our inbound geps. + Value *createNewBaseAddr(GetElementPtrInst *BaseGEP, Loop *L); + + LLVMContext *Ctx = nullptr; + const DataLayout *DL = nullptr; + IntegerType *IntPtrTy = nullptr; + SmallVector BasePtrs; + DenseMap> Candidates; +}; + +} // end anonymous namespace + +char WebAssemblyOptimizeGEPs::ID = 0; +INITIALIZE_PASS(WebAssemblyOptimizeGEPs, DEBUG_TYPE, PASS_DESC, false, false) + +Pass *llvm::createWebAssemblyOptimizeGEPs() { + return new WebAssemblyOptimizeGEPs(); +} + +bool WebAssemblyOptimizeGEPs::runOnLoop(Loop *L, LPPassManager &) { + if (DisableWebAssemblyOptimizeGEPs) + return false; + if (skipLoop(L)) + return false; + if (!L->getLoopPreheader()) + return false; + + // LSR only operates on the inner most loops, so do the same. + if (!L->isInnermost()) { + LLVM_DEBUG(dbgs() << "WasmOptGep: Not inner most loop.\n"); + return false; + } + + if (L->getNumBlocks() > 1) { + LLVM_DEBUG(dbgs() << "WasmOptGep: Only handling single-block loops.\n"); + return false; + } + + Function *F = L->getLoopPreheader()->getParent(); + Ctx = &F->getContext(); + DL = &F->getParent()->getDataLayout(); + // Set the type we'll use for pointer arithmetic. 
+ IntPtrTy = DL->getIntPtrType(*Ctx); + Candidates.clear(); + BasePtrs.clear(); + return run(L); +} + +void WebAssemblyOptimizeGEPs::addCandidate(GetElementPtrInst *GEP) { + assert(isa(GEP->getPointerOperand()) && + "Expected GEP base pointer"); + auto *BasePtr = cast(GEP->getPointerOperand()); + if (!Candidates.count(BasePtr)) + BasePtrs.push_back(BasePtr); + Candidates[BasePtr].emplace_back(GEP); +} + +Value *WebAssemblyOptimizeGEPs::createNewBaseAddr(GetElementPtrInst *BaseGEP, + Loop *L) { + Value *BasePtr = BaseGEP->getOperand(0); + Value *Index = BaseGEP->getOperand(1); + Type *BaseType = BasePtr->getType(); + IRBuilder<> Builder(*Ctx); + Builder.SetInsertPoint(BaseGEP); + + LLVM_DEBUG(dbgs() << "WasmOptGep: Creating new base addr.\n" + << " with base address: " << *BasePtr << "\n" + << " and index: " << *Index << "\n"); + + assert(BasePtr->getType()->isPointerTy() && "Expected PointerTy"); + assert(Index->getType() == IntPtrTy && "Expected matching IntegerTy"); + + // Scale the index if needed. + if (DL->getTypeAllocSize(BaseGEP->getSourceElementType()) != 1) { + APInt Size = APInt(IntPtrTy->getIntegerBitWidth(), + DL->getTypeAllocSize(BaseGEP->getSourceElementType())); + if (Size.isPowerOf2()) + Index = + Builder.CreateShl(Index, ConstantInt::get(IntPtrTy, Size.logBase2())); + else + Index = Builder.CreateMul(Index, ConstantInt::get(IntPtrTy, Size)); + } + + Value *PtrToInt = Builder.CreatePtrToInt(BasePtr, IntPtrTy); + Value *PtrArith = Builder.CreateAdd(PtrToInt, Index); + Value *NewBase = Builder.CreateIntToPtr(PtrArith, BaseType); + + // Update the base gep to use the new base register with a zero index. + BaseGEP->setOperand(0, NewBase); + BaseGEP->setOperand(1, ConstantInt::get(IntPtrTy, 0)); + + LLVM_DEBUG(dbgs() << " new base addr: " << *NewBase << "\n"); + LLVM_DEBUG(dbgs() << " updated base gep: " << *BaseGEP << "\n"); + return NewBase; +} + +bool WebAssemblyOptimizeGEPs::run(Loop *L) { + // Search the loop for all the GEPs. 
+ assert(L->getNumBlocks() == 1 && "Expected single block loop"); + for (auto &I : *L->getHeader()) { + auto *GEP = dyn_cast(&I); + if (RebaseCandidate::isCandidate(GEP)) + addCandidate(GEP); + } + + if (Candidates.empty()) + return false; + + LLVM_DEBUG(dbgs() << "WasmOptGep: Found rebase candidates:\n"; + for (auto BasePtr + : BasePtrs) for (auto &Candidate + : Candidates[BasePtr]) Candidate.dump();); + + // Refactor the common base components into a new base address, updating each + // GEP to use it. + for (auto &BasePtr : BasePtrs) { + Value *NewBase = createNewBaseAddr(BasePtr, L); + for (auto &Candidate : Candidates[BasePtr]) + Candidate.update(NewBase); + } + + return true; +} Index: llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp =================================================================== --- llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -82,6 +82,7 @@ initializeWebAssemblyMCLowerPrePassPass(PR); initializeWebAssemblyLowerRefTypesIntPtrConvPass(PR); initializeWebAssemblyFixBrTableDefaultsPass(PR); + initializeWebAssemblyOptimizeGEPsPass(PR); } //===----------------------------------------------------------------------===// Index: llvm/test/CodeGen/WebAssembly/optimize-geps-wasm64.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/WebAssembly/optimize-geps-wasm64.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -enable-new-pm=0 -mtriple=wasm64 -separate-const-offset-from-gep -early-cse -wasm-optimize-geps -S %s -o - | FileCheck %s --check-prefix=CHECK +; RUN: opt -enable-new-pm=0 -mtriple=wasm64 -separate-const-offset-from-gep -early-cse -wasm-optimize-geps -loop-reduce -S %s -o - | FileCheck %s --check-prefix=AFTER-LSR + +target datalayout = "e-m:e-p:64:64-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" + +; Test that all the inbounds getelementptr 
instructions are using an immediate +; index, and that they are still inbounds, even after loop-reduce. + +define hidden void @unroll_twice(ptr nocapture noundef readonly %0, ptr nocapture noundef writeonly %1) { +; CHECK-LABEL: @unroll_twice( +; CHECK-NEXT: br label [[TMP4:%.*]] +; CHECK: 3: +; CHECK-NEXT: ret void +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP20:%.*]], [[TMP4]] ] +; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP0:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP1:%.*]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 0 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 1 +; CHECK-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 10000 +; CHECK-NEXT: br i1 [[TMP21]], label [[TMP3:%.*]], label [[TMP4]] +; +; AFTER-LSR-LABEL: @unroll_twice( +; AFTER-LSR-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP0:%.*]] to i64 +; AFTER-LSR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP1:%.*]] to i64 +; AFTER-LSR-NEXT: br label [[TMP6:%.*]] +; AFTER-LSR: 5: +; AFTER-LSR-NEXT: ret void +; AFTER-LSR: 6: +; AFTER-LSR-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[TMP6]] ], [ 0, 
[[TMP2:%.*]] ] +; AFTER-LSR-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], [[LSR_IV]] +; AFTER-LSR-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; AFTER-LSR-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 0 +; AFTER-LSR-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; AFTER-LSR-NEXT: [[TMP11:%.*]] = add i64 [[TMP4]], [[LSR_IV]] +; AFTER-LSR-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; AFTER-LSR-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 0 +; AFTER-LSR-NEXT: store i32 [[TMP10]], ptr [[TMP13]], align 4 +; AFTER-LSR-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 1 +; AFTER-LSR-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 +; AFTER-LSR-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 1 +; AFTER-LSR-NEXT: store i32 [[TMP15]], ptr [[TMP16]], align 4 +; AFTER-LSR-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i64 [[LSR_IV]], 8 +; AFTER-LSR-NEXT: [[TMP17:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 40000 +; AFTER-LSR-NEXT: br i1 [[TMP17]], label [[TMP5:%.*]], label [[TMP6]] +; + br label %4 + +3: ; preds = %4 + ret void + +4: ; preds = %4, %2 + %5 = phi i64 [ 0, %2 ], [ %13, %4 ] + %6 = getelementptr inbounds i32, ptr %0, i64 %5 + %7 = load i32, ptr %6, align 4 + %8 = getelementptr inbounds i32, ptr %1, i64 %5 + store i32 %7, ptr %8, align 4 + %9 = or i64 %5, 1 + %10 = getelementptr inbounds i32, ptr %0, i64 %9 + %11 = load i32, ptr %10, align 4 + %12 = getelementptr inbounds i32, ptr %1, i64 %9 + store i32 %11, ptr %12, align 4 + %13 = add nuw nsw i64 %5, 2 + %14 = icmp eq i64 %13, 10000 + br i1 %14, label %3, label %4 +} Index: llvm/test/CodeGen/WebAssembly/optimize-geps.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/WebAssembly/optimize-geps.ll @@ -0,0 +1,626 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -enable-new-pm=0 -mtriple=wasm32 -separate-const-offset-from-gep -early-cse 
-wasm-optimize-geps -S %s -o - | FileCheck %s --check-prefix=CHECK +; RUN: opt -enable-new-pm=0 -mtriple=wasm32 -separate-const-offset-from-gep -early-cse -wasm-optimize-geps -loop-reduce -S %s -o - | FileCheck %s --check-prefix=AFTER-LSR + +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" + +; Test that all the inbounds getelementptr instructions are using an immediate +; index, and that they are still inbounds, even after loop-reduce. + +define hidden void @unroll_twice_i8(ptr nocapture noundef readonly %arg, ptr nocapture noundef writeonly %arg1) { +; CHECK-LABEL: @unroll_twice_i8( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[I11:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[I]] +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to ptr +; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[I5:%.*]] = load i8, ptr [[I4]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[I]] +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr +; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store i8 [[I5]], ptr [[I6]], align 1 +; CHECK-NEXT: [[I82:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 1 +; CHECK-NEXT: [[I9:%.*]] = load i8, ptr [[I82]], align 1 +; CHECK-NEXT: [[I104:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 1 +; CHECK-NEXT: store i8 [[I9]], ptr [[I104]], align 1 +; CHECK-NEXT: [[I11]] = add nuw nsw i32 [[I]], 2 +; CHECK-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], 10000 +; CHECK-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +; AFTER-LSR-LABEL: @unroll_twice_i8( +; AFTER-LSR-NEXT: bb: +; AFTER-LSR-NEXT: [[ARG6:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; 
AFTER-LSR-NEXT: [[ARG15:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; AFTER-LSR-NEXT: br label [[BB3:%.*]] +; AFTER-LSR: bb2: +; AFTER-LSR-NEXT: ret void +; AFTER-LSR: bb3: +; AFTER-LSR-NEXT: [[I:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[I11:%.*]], [[BB3]] ] +; AFTER-LSR-NEXT: [[TMP0:%.*]] = add i32 [[ARG6]], [[I]] +; AFTER-LSR-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to ptr +; AFTER-LSR-NEXT: [[I4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; AFTER-LSR-NEXT: [[I5:%.*]] = load i8, ptr [[I4]], align 1 +; AFTER-LSR-NEXT: [[TMP2:%.*]] = add i32 [[ARG15]], [[I]] +; AFTER-LSR-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; AFTER-LSR-NEXT: [[I6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; AFTER-LSR-NEXT: store i8 [[I5]], ptr [[I6]], align 1 +; AFTER-LSR-NEXT: [[I82:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 1 +; AFTER-LSR-NEXT: [[I9:%.*]] = load i8, ptr [[I82]], align 1 +; AFTER-LSR-NEXT: [[I104:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 1 +; AFTER-LSR-NEXT: store i8 [[I9]], ptr [[I104]], align 1 +; AFTER-LSR-NEXT: [[I11]] = add nuw nsw i32 [[I]], 2 +; AFTER-LSR-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], 10000 +; AFTER-LSR-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i11, %bb3 ] + %i4 = getelementptr inbounds i8, ptr %arg, i32 %i + %i5 = load i8, ptr %i4, align 1 + %i6 = getelementptr inbounds i8, ptr %arg1, i32 %i + store i8 %i5, ptr %i6, align 1 + %i7 = or i32 %i, 1 + %i8 = getelementptr inbounds i8, ptr %arg, i32 %i7 + %i9 = load i8, ptr %i8, align 1 + %i10 = getelementptr inbounds i8, ptr %arg1, i32 %i7 + store i8 %i9, ptr %i10, align 1 + %i11 = add nuw nsw i32 %i, 2 + %i12 = icmp eq i32 %i11, 10000 + br i1 %i12, label %bb2, label %bb3 +} + +define hidden void @unroll_twice_i16(ptr nocapture noundef readonly %arg, ptr nocapture noundef writeonly %arg1) { +; CHECK-LABEL: @unroll_twice_i16( +; 
CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[I11:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[I]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[I5:%.*]] = load i16, ptr [[I4]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[I]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr +; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store i16 [[I5]], ptr [[I6]], align 2 +; CHECK-NEXT: [[I82:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 1 +; CHECK-NEXT: [[I9:%.*]] = load i16, ptr [[I82]], align 2 +; CHECK-NEXT: [[I104:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 1 +; CHECK-NEXT: store i16 [[I9]], ptr [[I104]], align 2 +; CHECK-NEXT: [[I11]] = add nuw nsw i32 [[I]], 2 +; CHECK-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], 10000 +; CHECK-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +; AFTER-LSR-LABEL: @unroll_twice_i16( +; AFTER-LSR-NEXT: bb: +; AFTER-LSR-NEXT: [[ARG6:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; AFTER-LSR-NEXT: [[ARG15:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; AFTER-LSR-NEXT: br label [[BB3:%.*]] +; AFTER-LSR: bb2: +; AFTER-LSR-NEXT: ret void +; AFTER-LSR: bb3: +; AFTER-LSR-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[BB3]] ], [ 0, [[BB:%.*]] ] +; AFTER-LSR-NEXT: [[TMP0:%.*]] = add i32 [[ARG6]], [[LSR_IV]] +; AFTER-LSR-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to ptr +; AFTER-LSR-NEXT: [[I4:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; AFTER-LSR-NEXT: [[I5:%.*]] = load i16, ptr [[I4]], 
align 2 +; AFTER-LSR-NEXT: [[TMP2:%.*]] = add i32 [[ARG15]], [[LSR_IV]] +; AFTER-LSR-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; AFTER-LSR-NEXT: [[I6:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 0 +; AFTER-LSR-NEXT: store i16 [[I5]], ptr [[I6]], align 2 +; AFTER-LSR-NEXT: [[I82:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 1 +; AFTER-LSR-NEXT: [[I9:%.*]] = load i16, ptr [[I82]], align 2 +; AFTER-LSR-NEXT: [[I104:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 1 +; AFTER-LSR-NEXT: store i16 [[I9]], ptr [[I104]], align 2 +; AFTER-LSR-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i32 [[LSR_IV]], 4 +; AFTER-LSR-NEXT: [[I12:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 20000 +; AFTER-LSR-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i11, %bb3 ] + %i4 = getelementptr inbounds i16, ptr %arg, i32 %i + %i5 = load i16, ptr %i4, align 2 + %i6 = getelementptr inbounds i16, ptr %arg1, i32 %i + store i16 %i5, ptr %i6, align 2 + %i7 = or i32 %i, 1 + %i8 = getelementptr inbounds i16, ptr %arg, i32 %i7 + %i9 = load i16, ptr %i8, align 2 + %i10 = getelementptr inbounds i16, ptr %arg1, i32 %i7 + store i16 %i9, ptr %i10, align 2 + %i11 = add nuw nsw i32 %i, 2 + %i12 = icmp eq i32 %i11, 10000 + br i1 %i12, label %bb2, label %bb3 +} + +define hidden void @unroll_twice_i32(ptr nocapture noundef readonly %arg, ptr nocapture noundef writeonly %arg1) { +; CHECK-LABEL: @unroll_twice_i32( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[I11:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[I]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr 
[[TMP3]], i32 0 +; CHECK-NEXT: [[I5:%.*]] = load i32, ptr [[I4]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[I]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr +; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store i32 [[I5]], ptr [[I6]], align 4 +; CHECK-NEXT: [[I82:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[I82]], align 4 +; CHECK-NEXT: [[I104:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; CHECK-NEXT: store i32 [[I9]], ptr [[I104]], align 4 +; CHECK-NEXT: [[I11]] = add nuw nsw i32 [[I]], 2 +; CHECK-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], 10000 +; CHECK-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +; AFTER-LSR-LABEL: @unroll_twice_i32( +; AFTER-LSR-NEXT: bb: +; AFTER-LSR-NEXT: [[ARG6:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; AFTER-LSR-NEXT: [[ARG15:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; AFTER-LSR-NEXT: br label [[BB3:%.*]] +; AFTER-LSR: bb2: +; AFTER-LSR-NEXT: ret void +; AFTER-LSR: bb3: +; AFTER-LSR-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[BB3]] ], [ 0, [[BB:%.*]] ] +; AFTER-LSR-NEXT: [[TMP0:%.*]] = add i32 [[ARG6]], [[LSR_IV]] +; AFTER-LSR-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to ptr +; AFTER-LSR-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; AFTER-LSR-NEXT: [[I5:%.*]] = load i32, ptr [[I4]], align 4 +; AFTER-LSR-NEXT: [[TMP2:%.*]] = add i32 [[ARG15]], [[LSR_IV]] +; AFTER-LSR-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; AFTER-LSR-NEXT: [[I6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; AFTER-LSR-NEXT: store i32 [[I5]], ptr [[I6]], align 4 +; AFTER-LSR-NEXT: [[I82:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +; AFTER-LSR-NEXT: [[I9:%.*]] = load i32, ptr [[I82]], align 4 +; AFTER-LSR-NEXT: [[I104:%.*]] = getelementptr 
inbounds i32, ptr [[TMP3]], i32 1 +; AFTER-LSR-NEXT: store i32 [[I9]], ptr [[I104]], align 4 +; AFTER-LSR-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i32 [[LSR_IV]], 8 +; AFTER-LSR-NEXT: [[I12:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 40000 +; AFTER-LSR-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i11, %bb3 ] + %i4 = getelementptr inbounds i32, ptr %arg, i32 %i + %i5 = load i32, ptr %i4, align 4 + %i6 = getelementptr inbounds i32, ptr %arg1, i32 %i + store i32 %i5, ptr %i6, align 4 + %i7 = or i32 %i, 1 + %i8 = getelementptr inbounds i32, ptr %arg, i32 %i7 + %i9 = load i32, ptr %i8, align 4 + %i10 = getelementptr inbounds i32, ptr %arg1, i32 %i7 + store i32 %i9, ptr %i10, align 4 + %i11 = add nuw nsw i32 %i, 2 + %i12 = icmp eq i32 %i11, 10000 + br i1 %i12, label %bb2, label %bb3 +} + +define hidden void @unroll_twice_i64(ptr nocapture noundef readonly %arg, ptr nocapture noundef writeonly %arg1) { +; CHECK-LABEL: @unroll_twice_i64( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[I11:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[I]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[I5:%.*]] = load i64, ptr [[I4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[I]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr +; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store i64 [[I5]], ptr [[I6]], align 8 +; CHECK-NEXT: [[I82:%.*]] = 
getelementptr inbounds i64, ptr [[TMP3]], i32 1 +; CHECK-NEXT: [[I9:%.*]] = load i64, ptr [[I82]], align 8 +; CHECK-NEXT: [[I104:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 1 +; CHECK-NEXT: store i64 [[I9]], ptr [[I104]], align 8 +; CHECK-NEXT: [[I11]] = add nuw nsw i32 [[I]], 2 +; CHECK-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], 10000 +; CHECK-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +; AFTER-LSR-LABEL: @unroll_twice_i64( +; AFTER-LSR-NEXT: bb: +; AFTER-LSR-NEXT: [[ARG6:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; AFTER-LSR-NEXT: [[ARG15:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; AFTER-LSR-NEXT: br label [[BB3:%.*]] +; AFTER-LSR: bb2: +; AFTER-LSR-NEXT: ret void +; AFTER-LSR: bb3: +; AFTER-LSR-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[BB3]] ], [ 0, [[BB:%.*]] ] +; AFTER-LSR-NEXT: [[TMP0:%.*]] = add i32 [[ARG6]], [[LSR_IV]] +; AFTER-LSR-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to ptr +; AFTER-LSR-NEXT: [[I4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; AFTER-LSR-NEXT: [[I5:%.*]] = load i64, ptr [[I4]], align 8 +; AFTER-LSR-NEXT: [[TMP2:%.*]] = add i32 [[ARG15]], [[LSR_IV]] +; AFTER-LSR-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; AFTER-LSR-NEXT: [[I6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 +; AFTER-LSR-NEXT: store i64 [[I5]], ptr [[I6]], align 8 +; AFTER-LSR-NEXT: [[I82:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 1 +; AFTER-LSR-NEXT: [[I9:%.*]] = load i64, ptr [[I82]], align 8 +; AFTER-LSR-NEXT: [[I104:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 1 +; AFTER-LSR-NEXT: store i64 [[I9]], ptr [[I104]], align 8 +; AFTER-LSR-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i32 [[LSR_IV]], 16 +; AFTER-LSR-NEXT: [[I12:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 80000 +; AFTER-LSR-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i11, %bb3 ] + %i4 = getelementptr inbounds i64, ptr 
%arg, i32 %i + %i5 = load i64, ptr %i4, align 8 + %i6 = getelementptr inbounds i64, ptr %arg1, i32 %i + store i64 %i5, ptr %i6, align 8 + %i7 = or i32 %i, 1 + %i8 = getelementptr inbounds i64, ptr %arg, i32 %i7 + %i9 = load i64, ptr %i8, align 8 + %i10 = getelementptr inbounds i64, ptr %arg1, i32 %i7 + store i64 %i9, ptr %i10, align 8 + %i11 = add nuw nsw i32 %i, 2 + %i12 = icmp eq i32 %i11, 10000 + br i1 %i12, label %bb2, label %bb3 +} + +define hidden void @no_inbound_loads(ptr nocapture noundef readonly %arg, ptr nocapture noundef writeonly %arg1) { +; CHECK-LABEL: @no_inbound_loads( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[I11:%.*]], [[BB3]] ] +; CHECK-NEXT: [[I4:%.*]] = getelementptr i32, ptr [[ARG:%.*]], i32 [[I]] +; CHECK-NEXT: [[I5:%.*]] = load i32, ptr [[I4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[I]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store i32 [[I5]], ptr [[I6]], align 4 +; CHECK-NEXT: [[I82:%.*]] = getelementptr i32, ptr [[I4]], i32 1 +; CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[I82]], align 4 +; CHECK-NEXT: [[I104:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; CHECK-NEXT: store i32 [[I9]], ptr [[I104]], align 4 +; CHECK-NEXT: [[I11]] = add nuw nsw i32 [[I]], 2 +; CHECK-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], 10000 +; CHECK-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +; AFTER-LSR-LABEL: @no_inbound_loads( +; AFTER-LSR-NEXT: bb: +; AFTER-LSR-NEXT: [[ARG15:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; AFTER-LSR-NEXT: br label [[BB3:%.*]] +; AFTER-LSR: bb2: +; AFTER-LSR-NEXT: ret void +; AFTER-LSR: bb3: +; AFTER-LSR-NEXT: [[LSR_IV:%.*]] = phi i32 [ 
[[LSR_IV_NEXT:%.*]], [[BB3]] ], [ 0, [[BB:%.*]] ] +; AFTER-LSR-NEXT: [[UGLYGEP7:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i32 [[LSR_IV]] +; AFTER-LSR-NEXT: [[I5:%.*]] = load i32, ptr [[UGLYGEP7]], align 4 +; AFTER-LSR-NEXT: [[TMP0:%.*]] = add i32 [[ARG15]], [[LSR_IV]] +; AFTER-LSR-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to ptr +; AFTER-LSR-NEXT: [[I6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; AFTER-LSR-NEXT: store i32 [[I5]], ptr [[I6]], align 4 +; AFTER-LSR-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ARG]], i32 [[LSR_IV]] +; AFTER-LSR-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i32 4 +; AFTER-LSR-NEXT: [[I9:%.*]] = load i32, ptr [[UGLYGEP6]], align 4 +; AFTER-LSR-NEXT: [[I104:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +; AFTER-LSR-NEXT: store i32 [[I9]], ptr [[I104]], align 4 +; AFTER-LSR-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i32 [[LSR_IV]], 8 +; AFTER-LSR-NEXT: [[I12:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 40000 +; AFTER-LSR-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i11, %bb3 ] + %i4 = getelementptr i32, ptr %arg, i32 %i + %i5 = load i32, ptr %i4, align 4 + %i6 = getelementptr inbounds i32, ptr %arg1, i32 %i + store i32 %i5, ptr %i6, align 4 + %i7 = or i32 %i, 1 + %i8 = getelementptr i32, ptr %arg, i32 %i7 + %i9 = load i32, ptr %i8, align 4 + %i10 = getelementptr inbounds i32, ptr %arg1, i32 %i7 + store i32 %i9, ptr %i10, align 4 + %i11 = add nuw nsw i32 %i, 2 + %i12 = icmp eq i32 %i11, 10000 + br i1 %i12, label %bb2, label %bb3 +} + +define hidden void @runtime_unroll(ptr nocapture noundef readonly %arg, ptr nocapture noundef readonly %arg1, ptr nocapture noundef writeonly %arg2, i32 noundef %arg3) { +; CHECK-LABEL: @runtime_unroll( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[I:%.*]] = icmp eq i32 [[ARG3:%.*]], 0 +; CHECK-NEXT: br i1 [[I]], label [[BB19:%.*]], label [[BB4:%.*]] +; CHECK: bb4: +; 
CHECK-NEXT: [[I5:%.*]] = and i32 [[ARG3]], 1 +; CHECK-NEXT: [[I6:%.*]] = icmp eq i32 [[ARG3]], 1 +; CHECK-NEXT: br i1 [[I6]], label [[BB9:%.*]], label [[BB7:%.*]] +; CHECK: bb7: +; CHECK-NEXT: [[I8:%.*]] = and i32 [[ARG3]], -2 +; CHECK-NEXT: br label [[BB20:%.*]] +; CHECK: bb9: +; CHECK-NEXT: [[I10:%.*]] = phi i32 [ 0, [[BB4]] ], [ [[I36:%.*]], [[BB20]] ] +; CHECK-NEXT: [[I11:%.*]] = icmp eq i32 [[I5]], 0 +; CHECK-NEXT: br i1 [[I11]], label [[BB19]], label [[BB12:%.*]] +; CHECK: bb12: +; CHECK-NEXT: [[I13:%.*]] = getelementptr inbounds float, ptr [[ARG:%.*]], i32 [[I10]] +; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[I13]], align 4 +; CHECK-NEXT: [[I15:%.*]] = getelementptr inbounds float, ptr [[ARG1:%.*]], i32 [[I10]] +; CHECK-NEXT: [[I16:%.*]] = load float, ptr [[I15]], align 4 +; CHECK-NEXT: [[I17:%.*]] = fadd float [[I14]], [[I16]] +; CHECK-NEXT: [[I18:%.*]] = getelementptr inbounds float, ptr [[ARG2:%.*]], i32 [[I10]] +; CHECK-NEXT: store float [[I17]], ptr [[I18]], align 4 +; CHECK-NEXT: br label [[BB19]] +; CHECK: bb19: +; CHECK-NEXT: ret void +; CHECK: bb20: +; CHECK-NEXT: [[I21:%.*]] = phi i32 [ 0, [[BB7]] ], [ [[I36]], [[BB20]] ] +; CHECK-NEXT: [[I22:%.*]] = phi i32 [ 0, [[BB7]] ], [ [[I37:%.*]], [[BB20]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[I21]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[ARG]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; CHECK-NEXT: [[I23:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[I24:%.*]] = load float, ptr [[I23]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[I21]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARG1]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr +; CHECK-NEXT: [[I25:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[I26:%.*]] = load float, ptr [[I25]], align 4 +; CHECK-NEXT: [[I27:%.*]] = fadd 
float [[I24]], [[I26]] +; CHECK-NEXT: [[TMP8:%.*]] = shl i32 [[I21]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARG2]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP10]] to ptr +; CHECK-NEXT: [[I28:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 +; CHECK-NEXT: store float [[I27]], ptr [[I28]], align 4 +; CHECK-NEXT: [[I302:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +; CHECK-NEXT: [[I31:%.*]] = load float, ptr [[I302]], align 4 +; CHECK-NEXT: [[I324:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 1 +; CHECK-NEXT: [[I33:%.*]] = load float, ptr [[I324]], align 4 +; CHECK-NEXT: [[I34:%.*]] = fadd float [[I31]], [[I33]] +; CHECK-NEXT: [[I356:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 1 +; CHECK-NEXT: store float [[I34]], ptr [[I356]], align 4 +; CHECK-NEXT: [[I36]] = add nuw i32 [[I21]], 2 +; CHECK-NEXT: [[I37]] = add i32 [[I22]], 2 +; CHECK-NEXT: [[I38:%.*]] = icmp eq i32 [[I37]], [[I8]] +; CHECK-NEXT: br i1 [[I38]], label [[BB9]], label [[BB20]] +; +; AFTER-LSR-LABEL: @runtime_unroll( +; AFTER-LSR-NEXT: bb: +; AFTER-LSR-NEXT: [[ARG11:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; AFTER-LSR-NEXT: [[ARG18:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; AFTER-LSR-NEXT: [[ARG27:%.*]] = ptrtoint ptr [[ARG2:%.*]] to i32 +; AFTER-LSR-NEXT: [[I:%.*]] = icmp eq i32 [[ARG3:%.*]], 0 +; AFTER-LSR-NEXT: br i1 [[I]], label [[BB19:%.*]], label [[BB4:%.*]] +; AFTER-LSR: bb4: +; AFTER-LSR-NEXT: [[I5:%.*]] = and i32 [[ARG3]], 1 +; AFTER-LSR-NEXT: [[I6:%.*]] = icmp eq i32 [[ARG3]], 1 +; AFTER-LSR-NEXT: br i1 [[I6]], label [[BB9:%.*]], label [[BB7:%.*]] +; AFTER-LSR: bb7: +; AFTER-LSR-NEXT: [[I8:%.*]] = and i32 [[ARG3]], -2 +; AFTER-LSR-NEXT: br label [[BB20:%.*]] +; AFTER-LSR: bb9.loopexit: +; AFTER-LSR-NEXT: br label [[BB9]] +; AFTER-LSR: bb9: +; AFTER-LSR-NEXT: [[I10:%.*]] = phi i32 [ 0, [[BB4]] ], [ [[I36:%.*]], [[BB9_LOOPEXIT:%.*]] ] +; AFTER-LSR-NEXT: [[I11:%.*]] = icmp 
eq i32 [[I5]], 0 +; AFTER-LSR-NEXT: br i1 [[I11]], label [[BB19]], label [[BB12:%.*]] +; AFTER-LSR: bb12: +; AFTER-LSR-NEXT: [[I13:%.*]] = getelementptr inbounds float, ptr [[ARG]], i32 [[I10]] +; AFTER-LSR-NEXT: [[I14:%.*]] = load float, ptr [[I13]], align 4 +; AFTER-LSR-NEXT: [[I15:%.*]] = getelementptr inbounds float, ptr [[ARG1]], i32 [[I10]] +; AFTER-LSR-NEXT: [[I16:%.*]] = load float, ptr [[I15]], align 4 +; AFTER-LSR-NEXT: [[I17:%.*]] = fadd float [[I14]], [[I16]] +; AFTER-LSR-NEXT: [[I18:%.*]] = getelementptr inbounds float, ptr [[ARG2]], i32 [[I10]] +; AFTER-LSR-NEXT: store float [[I17]], ptr [[I18]], align 4 +; AFTER-LSR-NEXT: br label [[BB19]] +; AFTER-LSR: bb19: +; AFTER-LSR-NEXT: ret void +; AFTER-LSR: bb20: +; AFTER-LSR-NEXT: [[LSR_IV12:%.*]] = phi i32 [ [[LSR_IV_NEXT13:%.*]], [[BB20]] ], [ [[ARG11]], [[BB7]] ] +; AFTER-LSR-NEXT: [[LSR_IV9:%.*]] = phi i32 [ [[LSR_IV_NEXT10:%.*]], [[BB20]] ], [ [[ARG18]], [[BB7]] ] +; AFTER-LSR-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[BB20]] ], [ [[ARG27]], [[BB7]] ] +; AFTER-LSR-NEXT: [[I21:%.*]] = phi i32 [ 0, [[BB7]] ], [ [[I36]], [[BB20]] ] +; AFTER-LSR-NEXT: [[TMP0:%.*]] = inttoptr i32 [[LSR_IV12]] to ptr +; AFTER-LSR-NEXT: [[I23:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0 +; AFTER-LSR-NEXT: [[I24:%.*]] = load float, ptr [[I23]], align 4 +; AFTER-LSR-NEXT: [[TMP1:%.*]] = inttoptr i32 [[LSR_IV9]] to ptr +; AFTER-LSR-NEXT: [[I25:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; AFTER-LSR-NEXT: [[I26:%.*]] = load float, ptr [[I25]], align 4 +; AFTER-LSR-NEXT: [[I27:%.*]] = fadd float [[I24]], [[I26]] +; AFTER-LSR-NEXT: [[TMP2:%.*]] = inttoptr i32 [[LSR_IV]] to ptr +; AFTER-LSR-NEXT: [[I28:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; AFTER-LSR-NEXT: store float [[I27]], ptr [[I28]], align 4 +; AFTER-LSR-NEXT: [[I302:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +; AFTER-LSR-NEXT: [[I31:%.*]] = load float, ptr [[I302]], align 4 +; AFTER-LSR-NEXT: 
[[I324:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +; AFTER-LSR-NEXT: [[I33:%.*]] = load float, ptr [[I324]], align 4 +; AFTER-LSR-NEXT: [[I34:%.*]] = fadd float [[I31]], [[I33]] +; AFTER-LSR-NEXT: [[I356:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 1 +; AFTER-LSR-NEXT: store float [[I34]], ptr [[I356]], align 4 +; AFTER-LSR-NEXT: [[I36]] = add i32 [[I21]], 2 +; AFTER-LSR-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 8 +; AFTER-LSR-NEXT: [[LSR_IV_NEXT10]] = add i32 [[LSR_IV9]], 8 +; AFTER-LSR-NEXT: [[LSR_IV_NEXT13]] = add i32 [[LSR_IV12]], 8 +; AFTER-LSR-NEXT: [[I38:%.*]] = icmp eq i32 [[I8]], [[I36]] +; AFTER-LSR-NEXT: br i1 [[I38]], label [[BB9_LOOPEXIT]], label [[BB20]] +; +bb: + %i = icmp eq i32 %arg3, 0 + br i1 %i, label %bb19, label %bb4 + +bb4: ; preds = %bb + %i5 = and i32 %arg3, 1 + %i6 = icmp eq i32 %arg3, 1 + br i1 %i6, label %bb9, label %bb7 + +bb7: ; preds = %bb4 + %i8 = and i32 %arg3, -2 + br label %bb20 + +bb9: ; preds = %bb20, %bb4 + %i10 = phi i32 [ 0, %bb4 ], [ %i36, %bb20 ] + %i11 = icmp eq i32 %i5, 0 + br i1 %i11, label %bb19, label %bb12 + +bb12: ; preds = %bb9 + %i13 = getelementptr inbounds float, ptr %arg, i32 %i10 + %i14 = load float, ptr %i13, align 4 + %i15 = getelementptr inbounds float, ptr %arg1, i32 %i10 + %i16 = load float, ptr %i15, align 4 + %i17 = fadd float %i14, %i16 + %i18 = getelementptr inbounds float, ptr %arg2, i32 %i10 + store float %i17, ptr %i18, align 4 + br label %bb19 + +bb19: ; preds = %bb12, %bb9, %bb + ret void + +bb20: ; preds = %bb20, %bb7 + %i21 = phi i32 [ 0, %bb7 ], [ %i36, %bb20 ] + %i22 = phi i32 [ 0, %bb7 ], [ %i37, %bb20 ] + %i23 = getelementptr inbounds float, ptr %arg, i32 %i21 + %i24 = load float, ptr %i23, align 4 + %i25 = getelementptr inbounds float, ptr %arg1, i32 %i21 + %i26 = load float, ptr %i25, align 4 + %i27 = fadd float %i24, %i26 + %i28 = getelementptr inbounds float, ptr %arg2, i32 %i21 + store float %i27, ptr %i28, align 4 + %i29 = or i32 %i21, 1 + %i30 = 
getelementptr inbounds float, ptr %arg, i32 %i29 + %i31 = load float, ptr %i30, align 4 + %i32 = getelementptr inbounds float, ptr %arg1, i32 %i29 + %i33 = load float, ptr %i32, align 4 + %i34 = fadd float %i31, %i33 + %i35 = getelementptr inbounds float, ptr %arg2, i32 %i29 + store float %i34, ptr %i35, align 4 + %i36 = add nuw i32 %i21, 2 + %i37 = add i32 %i22, 2 + %i38 = icmp eq i32 %i37, %i8 + br i1 %i38, label %bb9, label %bb20 +} + +@base_ptr = external global ptr + +define hidden void @global_base(ptr nocapture noundef writeonly %arg) { +; CHECK-LABEL: @global_base( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[I11:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[I]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 ptrtoint (ptr @base_ptr to i32), [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to ptr +; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[I5:%.*]] = load i32, ptr [[I4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[I]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP5]] to ptr +; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store i32 [[I5]], ptr [[I6]], align 4 +; CHECK-NEXT: [[I82:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[I82]], align 4 +; CHECK-NEXT: [[I104:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; CHECK-NEXT: store i32 [[I9]], ptr [[I104]], align 4 +; CHECK-NEXT: [[I11]] = add nuw nsw i32 [[I]], 2 +; CHECK-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], 10000 +; CHECK-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +; AFTER-LSR-LABEL: @global_base( +; AFTER-LSR-NEXT: bb: +; AFTER-LSR-NEXT: [[ARG5:%.*]] = ptrtoint ptr [[ARG:%.*]] 
to i32 +; AFTER-LSR-NEXT: br label [[BB3:%.*]] +; AFTER-LSR: bb2: +; AFTER-LSR-NEXT: ret void +; AFTER-LSR: bb3: +; AFTER-LSR-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[BB3]] ], [ 0, [[BB:%.*]] ] +; AFTER-LSR-NEXT: [[TMP0:%.*]] = add i32 [[LSR_IV]], ptrtoint (ptr @base_ptr to i32) +; AFTER-LSR-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to ptr +; AFTER-LSR-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; AFTER-LSR-NEXT: [[I5:%.*]] = load i32, ptr [[I4]], align 4 +; AFTER-LSR-NEXT: [[TMP2:%.*]] = add i32 [[ARG5]], [[LSR_IV]] +; AFTER-LSR-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; AFTER-LSR-NEXT: [[I6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; AFTER-LSR-NEXT: store i32 [[I5]], ptr [[I6]], align 4 +; AFTER-LSR-NEXT: [[I82:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +; AFTER-LSR-NEXT: [[I9:%.*]] = load i32, ptr [[I82]], align 4 +; AFTER-LSR-NEXT: [[I104:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; AFTER-LSR-NEXT: store i32 [[I9]], ptr [[I104]], align 4 +; AFTER-LSR-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i32 [[LSR_IV]], 8 +; AFTER-LSR-NEXT: [[I12:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 40000 +; AFTER-LSR-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i11, %bb3 ] + %i4 = getelementptr inbounds i32, ptr @base_ptr, i32 %i + %i5 = load i32, ptr %i4, align 4 + %i6 = getelementptr inbounds i32, ptr %arg, i32 %i + store i32 %i5, ptr %i6, align 4 + %i7 = or i32 %i, 1 + %i8 = getelementptr inbounds i32, ptr @base_ptr, i32 %i7 + %i9 = load i32, ptr %i8, align 4 + %i10 = getelementptr inbounds i32, ptr %arg, i32 %i7 + store i32 %i9, ptr %i10, align 4 + %i11 = add nuw nsw i32 %i, 2 + %i12 = icmp eq i32 %i11, 10000 + br i1 %i12, label %bb2, label %bb3 +} Index: llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn 
=================================================================== --- llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn +++ llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn @@ -57,6 +57,7 @@ "WebAssemblyMachineFunctionInfo.cpp", "WebAssemblyMemIntrinsicResults.cpp", "WebAssemblyNullifyDebugValueLists.cpp", + "WebAssemblyOptimizeGEPs.cpp", "WebAssemblyOptimizeLiveIntervals.cpp", "WebAssemblyOptimizeReturned.cpp", "WebAssemblyPeephole.cpp",