Index: llvm/lib/Target/WebAssembly/CMakeLists.txt =================================================================== --- llvm/lib/Target/WebAssembly/CMakeLists.txt +++ llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -40,6 +40,7 @@ WebAssemblyMCInstLower.cpp WebAssemblyMCLowerPrePass.cpp WebAssemblyNullifyDebugValueLists.cpp + WebAssemblyOptimizeGEPs.cpp WebAssemblyOptimizeLiveIntervals.cpp WebAssemblyOptimizeReturned.cpp WebAssemblyPeephole.cpp Index: llvm/lib/Target/WebAssembly/WebAssembly.h =================================================================== --- llvm/lib/Target/WebAssembly/WebAssembly.h +++ llvm/lib/Target/WebAssembly/WebAssembly.h @@ -23,6 +23,7 @@ class WebAssemblyTargetMachine; class ModulePass; class FunctionPass; +class Pass; // LLVM IR passes. ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(); @@ -30,6 +31,7 @@ ModulePass *createWebAssemblyFixFunctionBitcasts(); FunctionPass *createWebAssemblyOptimizeReturned(); FunctionPass *createWebAssemblyLowerRefTypesIntPtrConv(); +Pass *createWebAssemblyOptimizeGEPs(); // ISel and immediate followup passes. FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM, @@ -82,6 +84,7 @@ void initializeWebAssemblyPeepholePass(PassRegistry &); void initializeWebAssemblyMCLowerPrePassPass(PassRegistry &); void initializeWebAssemblyLowerRefTypesIntPtrConvPass(PassRegistry &); +void initializeWebAssemblyOptimizeGEPsPass(PassRegistry &); namespace WebAssembly { enum TargetIndex { Index: llvm/lib/Target/WebAssembly/WebAssemblyOptimizeGEPs.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/WebAssembly/WebAssemblyOptimizeGEPs.cpp @@ -0,0 +1,263 @@ +//===--- WebAssemblyOptimizeGEPs.cpp - GetElementPtr index processing ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Optimize inbounds GetElementPtr instructions so that LoopStrengthReduce and +/// ScalarEvolutionExpander don't make modifications to the IR which lose the +/// inbounds information. +/// +/// To use immediate address offsets, the add operations need to be marked as +/// nuw due to WebAssembly's infinite precision address offset calculation. +/// Unfortunately, the inbounds information is easily lost when converting +/// between IR -> SCEV -> IR during LSR. This pass finds inbounds GEPs, with the +/// form: (getelementptr %base, (or %reg_offset, %constant)) which is what is +/// generated after loop unrolling + instcombine. The GEPs are then converted +/// to use a constant index and a shared base pointer which is calculated using +/// raw pointer arithmetic. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "WebAssemblySubtarget.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "wasm-optimize-geps" +#define PASS_DESC "Optimize inner-loop GEPs for Webassembly" + +using namespace llvm; + +static cl::opt DisableWebAssemblyOptimizeGEPs( + "disable-wasm-optimize-geps", cl::Hidden, + cl::desc("WebAssembly: Disable getelementptr optimizations."), + cl::init(false)); + +namespace { + +using BaseOffsetPair = std::pair; + +class RebaseCandidate { +public: + RebaseCandidate(GetElementPtrInst 
*GEP, ConstantInt *ImmOffset) + : GEP(GEP), ImmOffset(ImmOffset) {} + + void update(Value *NewBasePtr) { + assert(NewBasePtr->getType()->isPointerTy() && "Expected PointerTy"); + GEP->setOperand(0, NewBasePtr); + GEP->setOperand(1, ImmOffset); + } + + GetElementPtrInst *getGEP() const { return GEP; } + +#ifndef NDEBUG + void dump() { + LLVM_DEBUG(dbgs() << "Candidate:\n" + << " ImmOffset: " << *ImmOffset << "\n" + << " Base: " << *GEP->getPointerOperand() << "\n" + << " GEP: " << *GEP << "\n"); + } +#endif + +private: + GetElementPtrInst *GEP; + ConstantInt *ImmOffset; +}; + +class WebAssemblyOptimizeGEPs : public LoopPass { +public: + static char ID; + + WebAssemblyOptimizeGEPs() : LoopPass(ID) {} + +private: + StringRef getPassName() const override { return PASS_DESC; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + bool run(Loop *L, const DominatorTree &DT); + void addCandidate(GetElementPtrInst *GEP, Value *RegOffset, + ConstantInt *ImmOffset); + // Create a new base address using pointer casting and add nuw. This should + // prevent LSR from messing with our inbound geps. 
+ Instruction *createNewBaseAddr(BaseOffsetPair &BasePair, Loop *L, + const DominatorTree &DT); + + LLVMContext *Ctx = nullptr; + IntegerType *ArithTy = nullptr; + SmallVector BasePairs; + DenseMap> Candidates; +}; + +} // end anonymous namespace + +char WebAssemblyOptimizeGEPs::ID = 0; +INITIALIZE_PASS_BEGIN(WebAssemblyOptimizeGEPs, DEBUG_TYPE, PASS_DESC, false, + false) +INITIALIZE_PASS_END(WebAssemblyOptimizeGEPs, DEBUG_TYPE, PASS_DESC, false, + false) + +Pass *llvm::createWebAssemblyOptimizeGEPs() { + return new WebAssemblyOptimizeGEPs(); +} + +bool WebAssemblyOptimizeGEPs::runOnLoop(Loop *L, LPPassManager &) { + if (DisableWebAssemblyOptimizeGEPs) + return false; + if (skipLoop(L)) + return false; + if (!L->getLoopPreheader()) + return false; + + // LSR only operates on the inner most loops, so do the same. + if (!L->isInnermost()) { + LLVM_DEBUG(dbgs() << "WasmOptGep: Not inner most loop.\n"); + return false; + } + + if (L->getNumBlocks() > 1) { + LLVM_DEBUG(dbgs() << "WasmOptGep: Only handling single-block loops.\n"); + return false; + } + + Function *F = L->getLoopPreheader()->getParent(); + Ctx = &F->getContext(); + auto &DT = getAnalysis().getDomTree(); + const WebAssemblySubtarget &ST = getAnalysis() + .getTM() + .getSubtarget(*F); + // Set the type we'll use for pointer arithmetic. + ArithTy = ST.hasAddr64() ? 
Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx); + Candidates.clear(); + BasePairs.clear(); + return run(L, DT); +} + +void WebAssemblyOptimizeGEPs::addCandidate(GetElementPtrInst *GEP, + Value *RegOffset, + ConstantInt *ImmOffset) { + Value *BasePtr = GEP->getPointerOperand(); + BaseOffsetPair BasePair = std::make_pair(BasePtr, RegOffset); + if (!Candidates.count(BasePair)) + BasePairs.push_back(BasePair); + Candidates[BasePair].emplace_back(GEP, ImmOffset); +} + +Instruction * +WebAssemblyOptimizeGEPs::createNewBaseAddr(BaseOffsetPair &BasePair, Loop *L, + const DominatorTree &DT) { + Value *BasePtr = BasePair.first; + Value *Offset = BasePair.second; + Type *BaseType = BasePtr->getType(); + IRBuilder<> Builder(*Ctx); + + LLVM_DEBUG(dbgs() << "WasmOptGep: Creating new base addr.\n" + << " with base address: " << *BasePtr << "\n" + << " and reg offset: " << *Offset << "\n"); + + assert(BasePtr->getType()->isPointerTy() && "Expected PointerTy"); + assert(Offset->getType() == ArithTy && "Expected matching IntegerTy"); + + auto *PtrToInt = cast(Builder.CreatePtrToInt(BasePtr, ArithTy)); + auto *PtrArith = + cast(Builder.CreateAdd(PtrToInt, Offset, "", /*nuw*/ true)); + auto *NewBase = cast(Builder.CreateIntToPtr(PtrArith, BaseType)); + + // Choose an insertion point for the address calculation: + // - either in the preheader, + // - or just before the first gep. 
+ if (L->isLoopInvariant(BasePtr) && L->isLoopInvariant(Offset)) { + NewBase->insertBefore(&L->getLoopPreheader()->back()); + } else { + SmallVectorImpl &Sorted = Candidates[BasePair]; + llvm::sort(Sorted, [&DT](RebaseCandidate &A, RebaseCandidate &B) { + return DT.dominates(A.getGEP(), B.getGEP()); + }); + const RebaseCandidate &FirstCandidate = Sorted.front(); + NewBase->insertBefore(FirstCandidate.getGEP()); + } + + PtrArith->insertBefore(NewBase); + PtrToInt->insertBefore(PtrArith); + + LLVM_DEBUG(dbgs() << " new base addr: " << *NewBase << "\n"); + return NewBase; +} + +bool WebAssemblyOptimizeGEPs::run(Loop *L, const DominatorTree &DT) { + // An inbound GetElementPtr with a single index. + auto IsValidGEP = [](GetElementPtrInst *GEP) { + return GEP && GEP->isInBounds() && GEP->getNumIndices() == 1; + }; + + // - An Or with a constant. + auto AddIfValidIndex = [this](GetElementPtrInst *GEP) { + if (auto *Index = dyn_cast(GEP->getOperand(1))) + if (Index->getOpcode() == Instruction::Or) + if (auto *ImmOffset = dyn_cast(Index->getOperand(1))) { + addCandidate(GEP, Index->getOperand(0), ImmOffset); + return true; + } + return false; + }; + + // Search the loop for all the GEPs and indices that meet our requirements, + // and also record any other valid GEPs to revisit. + assert(L->getNumBlocks() == 1 && "Expected single block loop"); + SmallVector ToRevisit; + for (auto &I : *L->getHeader()) { + auto *GEP = dyn_cast(&I); + if (IsValidGEP(GEP)) + if (!AddIfValidIndex(GEP)) + ToRevisit.push_back(GEP); + } + + // Look for any GEPs that are already accessing an address that we are + // going to regenerate during the rebase. 
+ for (auto *GEP : ToRevisit) { + auto BasePair = std::make_pair(GEP->getOperand(0), GEP->getOperand(1)); + if (Candidates.count(BasePair)) + addCandidate(GEP, GEP->getOperand(1), ConstantInt::get(ArithTy, 0)); + } + + if (Candidates.empty()) + return false; + + LLVM_DEBUG(dbgs() << "WasmOptGep: Found rebase candidates:\n"; + for (auto BasePair + : BasePairs) for (auto &Candidate + : Candidates[BasePair]) Candidate.dump();); + + // Refactor the common base components into a new base address, updating each + // GEP to use it, as well as an immediate index. + for (auto &BasePair : BasePairs) { + if (Candidates[BasePair].size() < 2) + continue; + Instruction *NewBase = createNewBaseAddr(BasePair, L, DT); + for (auto &Candidate : Candidates[BasePair]) + Candidate.update(NewBase); + } + + return true; +} Index: llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp =================================================================== --- llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -80,6 +80,7 @@ initializeWebAssemblyDebugFixupPass(PR); initializeWebAssemblyPeepholePass(PR); initializeWebAssemblyMCLowerPrePassPass(PR); + initializeWebAssemblyOptimizeGEPsPass(PR); } //===----------------------------------------------------------------------===// @@ -450,6 +451,10 @@ // Expand indirectbr instructions to switches. addPass(createIndirectBrExpandPass()); + // Modify GetElementPtr before LoopStrengthReduce. 
+ if (getOptLevel() != CodeGenOpt::None) + addPass(createWebAssemblyOptimizeGEPs()); + TargetPassConfig::addIRPasses(); } Index: llvm/test/CodeGen/WebAssembly/optimize-geps-wasm64.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/WebAssembly/optimize-geps-wasm64.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=wasm64 -instcombine -wasm-optimize-geps -loop-reduce -S %s -o - | FileCheck %s +target datalayout = "e-m:e-p:64:64-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" + +define hidden void @one_dim(ptr nocapture noundef readonly %0, ptr nocapture noundef writeonly %1) { +; CHECK-LABEL: @one_dim( +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP1:%.*]] to i64 +; CHECK-NEXT: br label [[TMP6:%.*]] +; CHECK: 5: +; CHECK-NEXT: ret void +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP19:%.*]], [[TMP6]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 0 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 1 +; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP18]], align 4 +; CHECK-NEXT: [[TMP19]] = add nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 10000 +; 
CHECK-NEXT: br i1 [[TMP20]], label [[TMP5:%.*]], label [[TMP6]] +; + br label %4 + +3: ; preds = %4 + ret void + +4: ; preds = %4, %2 + %5 = phi i64 [ 0, %2 ], [ %13, %4 ] + %6 = getelementptr inbounds i32, ptr %0, i64 %5 + %7 = load i32, ptr %6, align 4 + %8 = getelementptr inbounds i32, ptr %1, i64 %5 + store i32 %7, ptr %8, align 4 + %9 = or i64 %5, 1 + %10 = getelementptr inbounds i32, ptr %0, i64 %9 + %11 = load i32, ptr %10, align 4 + %12 = getelementptr inbounds i32, ptr %1, i64 %9 + store i32 %11, ptr %12, align 4 + %13 = add nuw nsw i64 %5, 2 + %14 = icmp eq i64 %13, 10000 + br i1 %14, label %3, label %4 +} Index: llvm/test/CodeGen/WebAssembly/optimize-geps.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/WebAssembly/optimize-geps.ll @@ -0,0 +1,285 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=wasm32 -instcombine -wasm-optimize-geps -loop-reduce -S %s -o - | FileCheck %s + +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" + +define hidden void @one_dim(ptr nocapture noundef readonly %arg, ptr nocapture noundef writeonly %arg1) { +; CHECK-LABEL: @one_dim( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[ARG2:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; CHECK-NEXT: [[ARG11:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[I11:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[ARG2]], [[I]] +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to ptr +; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[I5:%.*]] = load i32, ptr [[I4]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[ARG11]], [[I]] +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store i32 
[[I5]], ptr [[I6]], align 4 +; CHECK-NEXT: [[I7:%.*]] = or i32 [[I]], 1 +; CHECK-NEXT: [[I8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +; CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[I8]], align 4 +; CHECK-NEXT: [[I10:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; CHECK-NEXT: store i32 [[I9]], ptr [[I10]], align 4 +; CHECK-NEXT: [[I11]] = add nuw nsw i32 [[I]], 2 +; CHECK-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], 10000 +; CHECK-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i11, %bb3 ] + %i4 = getelementptr inbounds i32, ptr %arg, i32 %i + %i5 = load i32, ptr %i4, align 4 + %i6 = getelementptr inbounds i32, ptr %arg1, i32 %i + store i32 %i5, ptr %i6, align 4 + %i7 = or i32 %i, 1 + %i8 = getelementptr inbounds i32, ptr %arg, i32 %i7 + %i9 = load i32, ptr %i8, align 4 + %i10 = getelementptr inbounds i32, ptr %arg1, i32 %i7 + store i32 %i9, ptr %i10, align 4 + %i11 = add nuw nsw i32 %i, 2 + %i12 = icmp eq i32 %i11, 10000 + br i1 %i12, label %bb2, label %bb3 +} + +define hidden void @no_inbound_loads(ptr nocapture noundef readonly %arg, ptr nocapture noundef writeonly %arg1) { +; CHECK-LABEL: @no_inbound_loads( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[ARG11:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[UGLYGEP:%.*]], [[BB3]] ], [ [[ARG:%.*]], [[BB:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[BB]] ], [ [[I11:%.*]], [[BB3]] ] +; CHECK-NEXT: [[I5:%.*]] = load i32, ptr [[LSR_IV]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[ARG11]], [[I]] +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to ptr +; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store i32 [[I5]], ptr [[I6]], align 4 +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[LSR_IV]], i32 4 +; 
CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[UGLYGEP2]], align 4 +; CHECK-NEXT: [[I10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +; CHECK-NEXT: store i32 [[I9]], ptr [[I10]], align 4 +; CHECK-NEXT: [[I11]] = add nuw nsw i32 [[I]], 2 +; CHECK-NEXT: [[UGLYGEP]] = getelementptr i8, ptr [[LSR_IV]], i32 8 +; CHECK-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], 10000 +; CHECK-NEXT: br i1 [[I12]], label [[BB2:%.*]], label [[BB3]] +; +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i11, %bb3 ] + %i4 = getelementptr i32, ptr %arg, i32 %i + %i5 = load i32, ptr %i4, align 4 + %i6 = getelementptr inbounds i32, ptr %arg1, i32 %i + store i32 %i5, ptr %i6, align 4 + %i7 = or i32 %i, 1 + %i8 = getelementptr i32, ptr %arg, i32 %i7 + %i9 = load i32, ptr %i8, align 4 + %i10 = getelementptr inbounds i32, ptr %arg1, i32 %i7 + store i32 %i9, ptr %i10, align 4 + %i11 = add nuw nsw i32 %i, 2 + %i12 = icmp eq i32 %i11, 10000 + br i1 %i12, label %bb2, label %bb3 +} + +define hidden void @two_dims(ptr nocapture noundef readonly %arg, ptr nocapture noundef %arg1) local_unnamed_addr { +; CHECK-LABEL: @two_dims( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[I9:%.*]], [[BB8:%.*]] ] +; CHECK-NEXT: [[I3:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], i32 [[I]] +; CHECK-NEXT: [[I4:%.*]] = load ptr, ptr [[I3]], align 4 +; CHECK-NEXT: [[I41:%.*]] = ptrtoint ptr [[I4]] to i32 +; CHECK-NEXT: [[I5:%.*]] = getelementptr inbounds i32, ptr [[ARG1:%.*]], i32 [[I]] +; CHECK-NEXT: [[I6:%.*]] = load i32, ptr [[I5]], align 4 +; CHECK-NEXT: br label [[BB11:%.*]] +; CHECK: bb7: +; CHECK-NEXT: ret void +; CHECK: bb8: +; CHECK-NEXT: [[I9]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: [[I10:%.*]] = icmp eq i32 [[I9]], 10000 +; CHECK-NEXT: br i1 [[I10]], label [[BB7:%.*]], label [[BB2]] +; CHECK: bb11: +; CHECK-NEXT: [[I12:%.*]] = phi i32 [ [[I6]], [[BB2]] ], [ 
[[I20:%.*]], [[BB11]] ] +; CHECK-NEXT: [[I13:%.*]] = phi i32 [ 0, [[BB2]] ], [ [[I33:%.*]], [[BB11]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[I41]], [[I13]] +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to ptr +; CHECK-NEXT: [[I14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[I15:%.*]] = load i32, ptr [[I14]], align 4 +; CHECK-NEXT: [[I16:%.*]] = add nsw i32 [[I12]], [[I15]] +; CHECK-NEXT: store i32 [[I16]], ptr [[I5]], align 4 +; CHECK-NEXT: [[I17:%.*]] = or i32 [[I13]], 1 +; CHECK-NEXT: [[I18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +; CHECK-NEXT: [[I19:%.*]] = load i32, ptr [[I18]], align 4 +; CHECK-NEXT: [[I20]] = add nsw i32 [[I16]], [[I19]] +; CHECK-NEXT: store i32 [[I20]], ptr [[I5]], align 4 +; CHECK-NEXT: [[I33]] = add nuw nsw i32 [[I13]], 2 +; CHECK-NEXT: [[I34:%.*]] = icmp eq i32 [[I33]], 10000 +; CHECK-NEXT: br i1 [[I34]], label [[BB8]], label [[BB11]] +; +bb: + br label %bb2 + +bb2: ; preds = %bb8, %bb + %i = phi i32 [ 0, %bb ], [ %i9, %bb8 ] + %i3 = getelementptr inbounds ptr, ptr %arg, i32 %i + %i4 = load ptr, ptr %i3, align 4 + %i5 = getelementptr inbounds i32, ptr %arg1, i32 %i + %i6 = load i32, ptr %i5, align 4 + br label %bb11 + +bb7: ; preds = %bb8 + ret void + +bb8: ; preds = %bb11 + %i9 = add nuw nsw i32 %i, 1 + %i10 = icmp eq i32 %i9, 10000 + br i1 %i10, label %bb7, label %bb2 + +bb11: ; preds = %bb11, %bb2 + %i12 = phi i32 [ %i6, %bb2 ], [ %i20, %bb11 ] + %i13 = phi i32 [ 0, %bb2 ], [ %i33, %bb11 ] + %i14 = getelementptr inbounds i32, ptr %i4, i32 %i13 + %i15 = load i32, ptr %i14, align 4 + %i16 = add nsw i32 %i12, %i15 + store i32 %i16, ptr %i5, align 4 + %i17 = add nuw nsw i32 %i13, 1 + %i18 = getelementptr inbounds i32, ptr %i4, i32 %i17 + %i19 = load i32, ptr %i18, align 4 + %i20 = add nsw i32 %i16, %i19 + store i32 %i20, ptr %i5, align 4 + %i33 = add nuw nsw i32 %i13, 2 + %i34 = icmp eq i32 %i33, 10000 + br i1 %i34, label %bb8, label %bb11 +} + +define hidden void @runtime(ptr nocapture 
noundef readonly %arg, ptr nocapture noundef readonly %arg1, ptr nocapture noundef writeonly %arg2, i32 noundef %arg3) { +; CHECK-LABEL: @runtime( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[ARG4:%.*]] = ptrtoint ptr [[ARG:%.*]] to i32 +; CHECK-NEXT: [[ARG12:%.*]] = ptrtoint ptr [[ARG1:%.*]] to i32 +; CHECK-NEXT: [[ARG21:%.*]] = ptrtoint ptr [[ARG2:%.*]] to i32 +; CHECK-NEXT: [[I:%.*]] = icmp eq i32 [[ARG3:%.*]], 0 +; CHECK-NEXT: br i1 [[I]], label [[BB19:%.*]], label [[BB4:%.*]] +; CHECK: bb4: +; CHECK-NEXT: [[I5:%.*]] = and i32 [[ARG3]], 1 +; CHECK-NEXT: [[I6:%.*]] = icmp eq i32 [[ARG3]], 1 +; CHECK-NEXT: br i1 [[I6]], label [[BB9:%.*]], label [[BB7:%.*]] +; CHECK: bb7: +; CHECK-NEXT: [[I8:%.*]] = and i32 [[ARG3]], -2 +; CHECK-NEXT: br label [[BB20:%.*]] +; CHECK: bb9.loopexit: +; CHECK-NEXT: br label [[BB9]] +; CHECK: bb9: +; CHECK-NEXT: [[I10:%.*]] = phi i32 [ 0, [[BB4]] ], [ [[I36:%.*]], [[BB9_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[I11:%.*]] = icmp eq i32 [[I5]], 0 +; CHECK-NEXT: br i1 [[I11]], label [[BB19]], label [[BB12:%.*]] +; CHECK: bb12: +; CHECK-NEXT: [[I13:%.*]] = getelementptr inbounds float, ptr [[ARG]], i32 [[I10]] +; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[I13]], align 4 +; CHECK-NEXT: [[I15:%.*]] = getelementptr inbounds float, ptr [[ARG1]], i32 [[I10]] +; CHECK-NEXT: [[I16:%.*]] = load float, ptr [[I15]], align 4 +; CHECK-NEXT: [[I17:%.*]] = fadd float [[I14]], [[I16]] +; CHECK-NEXT: [[I18:%.*]] = getelementptr inbounds float, ptr [[ARG2]], i32 [[I10]] +; CHECK-NEXT: store float [[I17]], ptr [[I18]], align 4 +; CHECK-NEXT: br label [[BB19]] +; CHECK: bb19: +; CHECK-NEXT: ret void +; CHECK: bb20: +; CHECK-NEXT: [[I21:%.*]] = phi i32 [ 0, [[BB7]] ], [ [[I36]], [[BB20]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[ARG4]], [[I21]] +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to ptr +; CHECK-NEXT: [[I23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[I24:%.*]] = load float, ptr [[I23]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add 
i32 [[ARG12]], [[I21]] +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr +; CHECK-NEXT: [[I25:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[I26:%.*]] = load float, ptr [[I25]], align 4 +; CHECK-NEXT: [[I27:%.*]] = fadd float [[I24]], [[I26]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[ARG21]], [[I21]] +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr +; CHECK-NEXT: [[I28:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store float [[I27]], ptr [[I28]], align 4 +; CHECK-NEXT: [[I29:%.*]] = or i32 [[I21]], 1 +; CHECK-NEXT: [[I30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +; CHECK-NEXT: [[I31:%.*]] = load float, ptr [[I30]], align 4 +; CHECK-NEXT: [[I32:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +; CHECK-NEXT: [[I33:%.*]] = load float, ptr [[I32]], align 4 +; CHECK-NEXT: [[I34:%.*]] = fadd float [[I31]], [[I33]] +; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 1 +; CHECK-NEXT: store float [[I34]], ptr [[I35]], align 4 +; CHECK-NEXT: [[I36]] = add i32 [[I21]], 2 +; CHECK-NEXT: [[I38:%.*]] = icmp eq i32 [[I8]], [[I36]] +; CHECK-NEXT: br i1 [[I38]], label [[BB9_LOOPEXIT]], label [[BB20]] +; +bb: + %i = icmp eq i32 %arg3, 0 + br i1 %i, label %bb19, label %bb4 + +bb4: ; preds = %bb + %i5 = and i32 %arg3, 1 + %i6 = icmp eq i32 %arg3, 1 + br i1 %i6, label %bb9, label %bb7 + +bb7: ; preds = %bb4 + %i8 = and i32 %arg3, -2 + br label %bb20 + +bb9: ; preds = %bb20, %bb4 + %i10 = phi i32 [ 0, %bb4 ], [ %i36, %bb20 ] + %i11 = icmp eq i32 %i5, 0 + br i1 %i11, label %bb19, label %bb12 + +bb12: ; preds = %bb9 + %i13 = getelementptr inbounds float, ptr %arg, i32 %i10 + %i14 = load float, ptr %i13, align 4 + %i15 = getelementptr inbounds float, ptr %arg1, i32 %i10 + %i16 = load float, ptr %i15, align 4 + %i17 = fadd float %i14, %i16 + %i18 = getelementptr inbounds float, ptr %arg2, i32 %i10 + store float %i17, ptr %i18, align 4 + br label %bb19 + +bb19: ; preds = 
%bb12, %bb9, %bb + ret void + +bb20: ; preds = %bb20, %bb7 + %i21 = phi i32 [ 0, %bb7 ], [ %i36, %bb20 ] + %i22 = phi i32 [ 0, %bb7 ], [ %i37, %bb20 ] + %i23 = getelementptr inbounds float, ptr %arg, i32 %i21 + %i24 = load float, ptr %i23, align 4 + %i25 = getelementptr inbounds float, ptr %arg1, i32 %i21 + %i26 = load float, ptr %i25, align 4 + %i27 = fadd float %i24, %i26 + %i28 = getelementptr inbounds float, ptr %arg2, i32 %i21 + store float %i27, ptr %i28, align 4 + %i29 = or i32 %i21, 1 + %i30 = getelementptr inbounds float, ptr %arg, i32 %i29 + %i31 = load float, ptr %i30, align 4 + %i32 = getelementptr inbounds float, ptr %arg1, i32 %i29 + %i33 = load float, ptr %i32, align 4 + %i34 = fadd float %i31, %i33 + %i35 = getelementptr inbounds float, ptr %arg2, i32 %i29 + store float %i34, ptr %i35, align 4 + %i36 = add nuw i32 %i21, 2 + %i37 = add i32 %i22, 2 + %i38 = icmp eq i32 %i37, %i8 + br i1 %i38, label %bb9, label %bb20 +} Index: llvm/test/CodeGen/WebAssembly/unrolled-mem-indices.ll =================================================================== --- llvm/test/CodeGen/WebAssembly/unrolled-mem-indices.ll +++ llvm/test/CodeGen/WebAssembly/unrolled-mem-indices.ll @@ -6,50 +6,43 @@ define hidden void @one_dim(ptr nocapture noundef readonly %arg, ptr nocapture noundef readonly %arg1, ptr nocapture noundef writeonly %arg2) { ; CHECK-LABEL: one_dim: ; CHECK: .functype one_dim (i32, i32, i32) -> () -; CHECK-NEXT: .local i32, i32, i32 +; CHECK-NEXT: .local i32, i32, i32, i32 ; CHECK-NEXT: # %bb.0: # %bb -; CHECK-NEXT: i32.const $push22=, 0 -; CHECK-NEXT: local.set 3, $pop22 +; CHECK-NEXT: i32.const $push17=, 0 +; CHECK-NEXT: local.set 3, $pop17 ; CHECK-NEXT: .LBB0_1: # %bb4 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: loop # label0: -; CHECK-NEXT: local.get $push27=, 2 -; CHECK-NEXT: local.get $push24=, 1 -; CHECK-NEXT: local.get $push23=, 3 -; CHECK-NEXT: i32.add $push21=, $pop24, $pop23 -; CHECK-NEXT: local.tee $push20=, 4, $pop21 -; 
CHECK-NEXT: i32.load16_s $push1=, 0($pop20) -; CHECK-NEXT: local.get $push26=, 0 -; CHECK-NEXT: local.get $push25=, 3 -; CHECK-NEXT: i32.add $push19=, $pop26, $pop25 -; CHECK-NEXT: local.tee $push18=, 5, $pop19 -; CHECK-NEXT: i32.load16_s $push0=, 0($pop18) +; CHECK-NEXT: local.get $push19=, 2 +; CHECK-NEXT: local.get $push18=, 3 +; CHECK-NEXT: i32.add $push16=, $pop19, $pop18 +; CHECK-NEXT: local.tee $push15=, 4, $pop16 +; CHECK-NEXT: local.get $push21=, 1 +; CHECK-NEXT: local.get $push20=, 3 +; CHECK-NEXT: i32.add $push14=, $pop21, $pop20 +; CHECK-NEXT: local.tee $push13=, 5, $pop14 +; CHECK-NEXT: i32.load16_s $push1=, 0($pop13) +; CHECK-NEXT: local.get $push23=, 0 +; CHECK-NEXT: local.get $push22=, 3 +; CHECK-NEXT: i32.add $push12=, $pop23, $pop22 +; CHECK-NEXT: local.tee $push11=, 6, $pop12 +; CHECK-NEXT: i32.load16_s $push0=, 0($pop11) ; CHECK-NEXT: i32.add $push2=, $pop1, $pop0 -; CHECK-NEXT: i32.store 0($pop27), $pop2 -; CHECK-NEXT: local.get $push28=, 2 -; CHECK-NEXT: i32.const $push17=, 4 -; CHECK-NEXT: i32.add $push8=, $pop28, $pop17 -; CHECK-NEXT: local.get $push29=, 4 -; CHECK-NEXT: i32.const $push16=, 2 -; CHECK-NEXT: i32.add $push5=, $pop29, $pop16 -; CHECK-NEXT: i32.load16_s $push6=, 0($pop5) -; CHECK-NEXT: local.get $push30=, 5 -; CHECK-NEXT: i32.const $push15=, 2 -; CHECK-NEXT: i32.add $push3=, $pop30, $pop15 -; CHECK-NEXT: i32.load16_s $push4=, 0($pop3) -; CHECK-NEXT: i32.add $push7=, $pop6, $pop4 -; CHECK-NEXT: i32.store 0($pop8), $pop7 -; CHECK-NEXT: local.get $push32=, 2 -; CHECK-NEXT: i32.const $push14=, 8 -; CHECK-NEXT: i32.add $push31=, $pop32, $pop14 -; CHECK-NEXT: local.set 2, $pop31 -; CHECK-NEXT: local.get $push33=, 3 -; CHECK-NEXT: i32.const $push13=, 4 -; CHECK-NEXT: i32.add $push12=, $pop33, $pop13 -; CHECK-NEXT: local.tee $push11=, 3, $pop12 -; CHECK-NEXT: i32.const $push10=, 20000 -; CHECK-NEXT: i32.ne $push9=, $pop11, $pop10 -; CHECK-NEXT: br_if 0, $pop9 # 0: up to label0 +; CHECK-NEXT: i32.store 0($pop15), $pop2 +; CHECK-NEXT: 
local.get $push26=, 4 +; CHECK-NEXT: local.get $push24=, 5 +; CHECK-NEXT: i32.load16_s $push4=, 2($pop24) +; CHECK-NEXT: local.get $push25=, 6 +; CHECK-NEXT: i32.load16_s $push3=, 2($pop25) +; CHECK-NEXT: i32.add $push5=, $pop4, $pop3 +; CHECK-NEXT: i32.store 4($pop26), $pop5 +; CHECK-NEXT: local.get $push27=, 3 +; CHECK-NEXT: i32.const $push10=, 2 +; CHECK-NEXT: i32.add $push9=, $pop27, $pop10 +; CHECK-NEXT: local.tee $push8=, 3, $pop9 +; CHECK-NEXT: i32.const $push7=, 10000 +; CHECK-NEXT: i32.ne $push6=, $pop8, $pop7 +; CHECK-NEXT: br_if 0, $pop6 # 0: up to label0 ; CHECK-NEXT: # %bb.2: # %bb3 ; CHECK-NEXT: end_loop ; CHECK-NEXT: # fallthrough-return @@ -90,48 +83,46 @@ ; CHECK: .functype one_dim_no_inbound_loads (i32, i32, i32) -> () ; CHECK-NEXT: .local i32, i32, i32 ; CHECK-NEXT: # %bb.0: # %bb -; CHECK-NEXT: i32.const $push22=, 0 -; CHECK-NEXT: local.set 3, $pop22 +; CHECK-NEXT: i32.const $push20=, 0 +; CHECK-NEXT: local.set 3, $pop20 ; CHECK-NEXT: .LBB1_1: # %bb4 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: loop # label1: -; CHECK-NEXT: local.get $push27=, 2 -; CHECK-NEXT: local.get $push24=, 1 +; CHECK-NEXT: local.get $push25=, 2 +; CHECK-NEXT: local.get $push22=, 1 +; CHECK-NEXT: local.get $push21=, 3 +; CHECK-NEXT: i32.add $push19=, $pop22, $pop21 +; CHECK-NEXT: local.tee $push18=, 4, $pop19 +; CHECK-NEXT: i32.load16_s $push1=, 0($pop18) +; CHECK-NEXT: local.get $push24=, 0 ; CHECK-NEXT: local.get $push23=, 3 -; CHECK-NEXT: i32.add $push21=, $pop24, $pop23 -; CHECK-NEXT: local.tee $push20=, 4, $pop21 -; CHECK-NEXT: i32.load16_s $push1=, 0($pop20) -; CHECK-NEXT: local.get $push26=, 0 -; CHECK-NEXT: local.get $push25=, 3 -; CHECK-NEXT: i32.add $push19=, $pop26, $pop25 -; CHECK-NEXT: local.tee $push18=, 5, $pop19 -; CHECK-NEXT: i32.load16_s $push0=, 0($pop18) +; CHECK-NEXT: i32.add $push17=, $pop24, $pop23 +; CHECK-NEXT: local.tee $push16=, 5, $pop17 +; CHECK-NEXT: i32.load16_s $push0=, 0($pop16) ; CHECK-NEXT: i32.add $push2=, $pop1, $pop0 
-; CHECK-NEXT: i32.store 0($pop27), $pop2 +; CHECK-NEXT: i32.store 0($pop25), $pop2 ; CHECK-NEXT: local.get $push28=, 2 -; CHECK-NEXT: i32.const $push17=, 4 -; CHECK-NEXT: i32.add $push8=, $pop28, $pop17 -; CHECK-NEXT: local.get $push29=, 4 -; CHECK-NEXT: i32.const $push16=, 2 -; CHECK-NEXT: i32.add $push5=, $pop29, $pop16 -; CHECK-NEXT: i32.load16_s $push6=, 0($pop5) -; CHECK-NEXT: local.get $push30=, 5 +; CHECK-NEXT: local.get $push26=, 4 ; CHECK-NEXT: i32.const $push15=, 2 -; CHECK-NEXT: i32.add $push3=, $pop30, $pop15 +; CHECK-NEXT: i32.add $push5=, $pop26, $pop15 +; CHECK-NEXT: i32.load16_s $push6=, 0($pop5) +; CHECK-NEXT: local.get $push27=, 5 +; CHECK-NEXT: i32.const $push14=, 2 +; CHECK-NEXT: i32.add $push3=, $pop27, $pop14 ; CHECK-NEXT: i32.load16_s $push4=, 0($pop3) ; CHECK-NEXT: i32.add $push7=, $pop6, $pop4 -; CHECK-NEXT: i32.store 0($pop8), $pop7 -; CHECK-NEXT: local.get $push32=, 2 -; CHECK-NEXT: i32.const $push14=, 8 -; CHECK-NEXT: i32.add $push31=, $pop32, $pop14 -; CHECK-NEXT: local.set 2, $pop31 -; CHECK-NEXT: local.get $push33=, 3 -; CHECK-NEXT: i32.const $push13=, 4 -; CHECK-NEXT: i32.add $push12=, $pop33, $pop13 -; CHECK-NEXT: local.tee $push11=, 3, $pop12 -; CHECK-NEXT: i32.const $push10=, 20000 -; CHECK-NEXT: i32.ne $push9=, $pop11, $pop10 -; CHECK-NEXT: br_if 0, $pop9 # 0: up to label1 +; CHECK-NEXT: i32.store 4($pop28), $pop7 +; CHECK-NEXT: local.get $push30=, 2 +; CHECK-NEXT: i32.const $push13=, 2 +; CHECK-NEXT: i32.add $push29=, $pop30, $pop13 +; CHECK-NEXT: local.set 2, $pop29 +; CHECK-NEXT: local.get $push31=, 3 +; CHECK-NEXT: i32.const $push12=, 4 +; CHECK-NEXT: i32.add $push11=, $pop31, $pop12 +; CHECK-NEXT: local.tee $push10=, 3, $pop11 +; CHECK-NEXT: i32.const $push9=, 20000 +; CHECK-NEXT: i32.ne $push8=, $pop10, $pop9 +; CHECK-NEXT: br_if 0, $pop8 # 0: up to label1 ; CHECK-NEXT: # %bb.2: # %bb3 ; CHECK-NEXT: end_loop ; CHECK-NEXT: # fallthrough-return @@ -172,101 +163,89 @@ ; CHECK: .functype two_dims (i32, i32, i32) -> () ; 
CHECK-NEXT: .local i32, i32, i32, i32, i32, i32, i32, i32 ; CHECK-NEXT: # %bb.0: # %bb -; CHECK-NEXT: i32.const $push48=, 0 -; CHECK-NEXT: local.set 3, $pop48 +; CHECK-NEXT: i32.const $push36=, 0 +; CHECK-NEXT: local.set 3, $pop36 ; CHECK-NEXT: .LBB2_1: # %bb3 ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB2_2 Depth 2 ; CHECK-NEXT: loop # label2: -; CHECK-NEXT: local.get $push50=, 2 -; CHECK-NEXT: local.get $push49=, 3 -; CHECK-NEXT: i32.const $push29=, 2 -; CHECK-NEXT: i32.shl $push28=, $pop49, $pop29 -; CHECK-NEXT: local.tee $push27=, 4, $pop28 -; CHECK-NEXT: i32.add $push26=, $pop50, $pop27 -; CHECK-NEXT: local.tee $push25=, 5, $pop26 -; CHECK-NEXT: i32.load $push51=, 0($pop25) -; CHECK-NEXT: local.set 6, $pop51 -; CHECK-NEXT: local.get $push53=, 1 -; CHECK-NEXT: local.get $push52=, 4 -; CHECK-NEXT: i32.add $push0=, $pop53, $pop52 -; CHECK-NEXT: i32.load $push54=, 0($pop0) -; CHECK-NEXT: local.set 7, $pop54 -; CHECK-NEXT: local.get $push56=, 0 -; CHECK-NEXT: local.get $push55=, 4 -; CHECK-NEXT: i32.add $push1=, $pop56, $pop55 -; CHECK-NEXT: i32.load $push57=, 0($pop1) -; CHECK-NEXT: local.set 8, $pop57 -; CHECK-NEXT: i32.const $push58=, 0 -; CHECK-NEXT: local.set 4, $pop58 +; CHECK-NEXT: local.get $push38=, 2 +; CHECK-NEXT: local.get $push37=, 3 +; CHECK-NEXT: i32.const $push23=, 2 +; CHECK-NEXT: i32.shl $push22=, $pop37, $pop23 +; CHECK-NEXT: local.tee $push21=, 4, $pop22 +; CHECK-NEXT: i32.add $push20=, $pop38, $pop21 +; CHECK-NEXT: local.tee $push19=, 5, $pop20 +; CHECK-NEXT: i32.load $push39=, 0($pop19) +; CHECK-NEXT: local.set 6, $pop39 +; CHECK-NEXT: local.get $push41=, 1 +; CHECK-NEXT: local.get $push40=, 4 +; CHECK-NEXT: i32.add $push0=, $pop41, $pop40 +; CHECK-NEXT: i32.load $push42=, 0($pop0) +; CHECK-NEXT: local.set 7, $pop42 +; CHECK-NEXT: local.get $push44=, 0 +; CHECK-NEXT: local.get $push43=, 4 +; CHECK-NEXT: i32.add $push1=, $pop44, $pop43 +; CHECK-NEXT: i32.load $push45=, 0($pop1) +; CHECK-NEXT: local.set 8, $pop45 +; 
CHECK-NEXT: i32.const $push46=, 0 +; CHECK-NEXT: local.set 4, $pop46 ; CHECK-NEXT: .LBB2_2: # %bb14 ; CHECK-NEXT: # Parent Loop BB2_1 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 ; CHECK-NEXT: loop # label3: -; CHECK-NEXT: local.get $push60=, 7 -; CHECK-NEXT: local.get $push59=, 4 -; CHECK-NEXT: i32.add $push43=, $pop60, $pop59 -; CHECK-NEXT: local.tee $push42=, 9, $pop43 -; CHECK-NEXT: i32.const $push41=, 6 -; CHECK-NEXT: i32.add $push20=, $pop42, $pop41 -; CHECK-NEXT: i32.load16_s $push21=, 0($pop20) -; CHECK-NEXT: local.get $push62=, 8 -; CHECK-NEXT: local.get $push61=, 4 -; CHECK-NEXT: i32.add $push40=, $pop62, $pop61 -; CHECK-NEXT: local.tee $push39=, 10, $pop40 -; CHECK-NEXT: i32.const $push38=, 6 -; CHECK-NEXT: i32.add $push18=, $pop39, $pop38 -; CHECK-NEXT: i32.load16_s $push19=, 0($pop18) -; CHECK-NEXT: i32.add $push22=, $pop21, $pop19 -; CHECK-NEXT: local.get $push63=, 9 -; CHECK-NEXT: i32.const $push37=, 4 -; CHECK-NEXT: i32.add $push14=, $pop63, $pop37 -; CHECK-NEXT: i32.load16_s $push15=, 0($pop14) -; CHECK-NEXT: local.get $push64=, 10 -; CHECK-NEXT: i32.const $push36=, 4 -; CHECK-NEXT: i32.add $push12=, $pop64, $pop36 -; CHECK-NEXT: i32.load16_s $push13=, 0($pop12) -; CHECK-NEXT: i32.add $push16=, $pop15, $pop13 -; CHECK-NEXT: local.get $push65=, 9 -; CHECK-NEXT: i32.const $push35=, 2 -; CHECK-NEXT: i32.add $push8=, $pop65, $pop35 -; CHECK-NEXT: i32.load16_s $push9=, 0($pop8) -; CHECK-NEXT: local.get $push66=, 10 -; CHECK-NEXT: i32.const $push34=, 2 -; CHECK-NEXT: i32.add $push6=, $pop66, $pop34 -; CHECK-NEXT: i32.load16_s $push7=, 0($pop6) -; CHECK-NEXT: i32.add $push10=, $pop9, $pop7 -; CHECK-NEXT: local.get $push67=, 9 -; CHECK-NEXT: i32.load16_s $push3=, 0($pop67) -; CHECK-NEXT: local.get $push68=, 10 -; CHECK-NEXT: i32.load16_s $push2=, 0($pop68) +; CHECK-NEXT: local.get $push48=, 7 +; CHECK-NEXT: local.get $push47=, 4 +; CHECK-NEXT: i32.add $push31=, $pop48, $pop47 +; CHECK-NEXT: local.tee $push30=, 9, $pop31 +; CHECK-NEXT: 
i32.load16_s $push15=, 6($pop30) +; CHECK-NEXT: local.get $push50=, 8 +; CHECK-NEXT: local.get $push49=, 4 +; CHECK-NEXT: i32.add $push29=, $pop50, $pop49 +; CHECK-NEXT: local.tee $push28=, 10, $pop29 +; CHECK-NEXT: i32.load16_s $push14=, 6($pop28) +; CHECK-NEXT: i32.add $push16=, $pop15, $pop14 +; CHECK-NEXT: local.get $push51=, 9 +; CHECK-NEXT: i32.load16_s $push11=, 4($pop51) +; CHECK-NEXT: local.get $push52=, 10 +; CHECK-NEXT: i32.load16_s $push10=, 4($pop52) +; CHECK-NEXT: i32.add $push12=, $pop11, $pop10 +; CHECK-NEXT: local.get $push53=, 9 +; CHECK-NEXT: i32.load16_s $push3=, 2($pop53) +; CHECK-NEXT: local.get $push54=, 10 +; CHECK-NEXT: i32.load16_s $push2=, 2($pop54) ; CHECK-NEXT: i32.add $push4=, $pop3, $pop2 -; CHECK-NEXT: local.get $push69=, 6 -; CHECK-NEXT: i32.add $push5=, $pop4, $pop69 -; CHECK-NEXT: i32.add $push11=, $pop10, $pop5 -; CHECK-NEXT: i32.add $push17=, $pop16, $pop11 -; CHECK-NEXT: i32.add $push70=, $pop22, $pop17 -; CHECK-NEXT: local.set 6, $pop70 -; CHECK-NEXT: local.get $push71=, 4 -; CHECK-NEXT: i32.const $push33=, 8 -; CHECK-NEXT: i32.add $push32=, $pop71, $pop33 -; CHECK-NEXT: local.tee $push31=, 4, $pop32 -; CHECK-NEXT: i32.const $push30=, 20000 -; CHECK-NEXT: i32.ne $push23=, $pop31, $pop30 -; CHECK-NEXT: br_if 0, $pop23 # 0: up to label3 +; CHECK-NEXT: local.get $push55=, 9 +; CHECK-NEXT: i32.load16_s $push6=, 0($pop55) +; CHECK-NEXT: local.get $push56=, 10 +; CHECK-NEXT: i32.load16_s $push5=, 0($pop56) +; CHECK-NEXT: i32.add $push7=, $pop6, $pop5 +; CHECK-NEXT: local.get $push57=, 6 +; CHECK-NEXT: i32.add $push8=, $pop7, $pop57 +; CHECK-NEXT: i32.add $push9=, $pop4, $pop8 +; CHECK-NEXT: i32.add $push13=, $pop12, $pop9 +; CHECK-NEXT: i32.add $push58=, $pop16, $pop13 +; CHECK-NEXT: local.set 6, $pop58 +; CHECK-NEXT: local.get $push59=, 4 +; CHECK-NEXT: i32.const $push27=, 4 +; CHECK-NEXT: i32.add $push26=, $pop59, $pop27 +; CHECK-NEXT: local.tee $push25=, 4, $pop26 +; CHECK-NEXT: i32.const $push24=, 10000 +; CHECK-NEXT: i32.ne 
$push17=, $pop25, $pop24 +; CHECK-NEXT: br_if 0, $pop17 # 0: up to label3 ; CHECK-NEXT: # %bb.3: # %bb11 ; CHECK-NEXT: # in Loop: Header=BB2_1 Depth=1 ; CHECK-NEXT: end_loop -; CHECK-NEXT: local.get $push73=, 5 -; CHECK-NEXT: local.get $push72=, 6 -; CHECK-NEXT: i32.store 0($pop73), $pop72 -; CHECK-NEXT: local.get $push74=, 3 -; CHECK-NEXT: i32.const $push47=, 1 -; CHECK-NEXT: i32.add $push46=, $pop74, $pop47 -; CHECK-NEXT: local.tee $push45=, 3, $pop46 -; CHECK-NEXT: i32.const $push44=, 10000 -; CHECK-NEXT: i32.ne $push24=, $pop45, $pop44 -; CHECK-NEXT: br_if 0, $pop24 # 0: up to label2 +; CHECK-NEXT: local.get $push61=, 5 +; CHECK-NEXT: local.get $push60=, 6 +; CHECK-NEXT: i32.store 0($pop61), $pop60 +; CHECK-NEXT: local.get $push62=, 3 +; CHECK-NEXT: i32.const $push35=, 1 +; CHECK-NEXT: i32.add $push34=, $pop62, $pop35 +; CHECK-NEXT: local.tee $push33=, 3, $pop34 +; CHECK-NEXT: i32.const $push32=, 10000 +; CHECK-NEXT: i32.ne $push18=, $pop33, $pop32 +; CHECK-NEXT: br_if 0, $pop18 # 0: up to label2 ; CHECK-NEXT: # %bb.4: # %bb10 ; CHECK-NEXT: end_loop ; CHECK-NEXT: # fallthrough-return @@ -341,99 +320,84 @@ ; CHECK-NEXT: .local i32, i32, i32, i32, i32 ; CHECK-NEXT: # %bb.0: # %bb ; CHECK-NEXT: block -; CHECK-NEXT: local.get $push32=, 3 -; CHECK-NEXT: i32.eqz $push64=, $pop32 -; CHECK-NEXT: br_if 0, $pop64 # 0: down to label4 +; CHECK-NEXT: local.get $push29=, 3 +; CHECK-NEXT: i32.eqz $push55=, $pop29 +; CHECK-NEXT: br_if 0, $pop55 # 0: down to label4 ; CHECK-NEXT: # %bb.1: # %bb4 -; CHECK-NEXT: local.get $push34=, 3 +; CHECK-NEXT: local.get $push31=, 3 ; CHECK-NEXT: i32.const $push0=, 1 -; CHECK-NEXT: i32.and $push33=, $pop34, $pop0 -; CHECK-NEXT: local.set 4, $pop33 -; CHECK-NEXT: i32.const $push35=, 0 -; CHECK-NEXT: local.set 5, $pop35 +; CHECK-NEXT: i32.and $push30=, $pop31, $pop0 +; CHECK-NEXT: local.set 4, $pop30 +; CHECK-NEXT: i32.const $push32=, 0 +; CHECK-NEXT: local.set 5, $pop32 ; CHECK-NEXT: block -; CHECK-NEXT: local.get $push36=, 3 -; CHECK-NEXT: 
i32.const $push20=, 1 -; CHECK-NEXT: i32.eq $push1=, $pop36, $pop20 +; CHECK-NEXT: local.get $push33=, 3 +; CHECK-NEXT: i32.const $push17=, 1 +; CHECK-NEXT: i32.eq $push1=, $pop33, $pop17 ; CHECK-NEXT: br_if 0, $pop1 # 0: down to label5 ; CHECK-NEXT: # %bb.2: # %bb7 -; CHECK-NEXT: local.get $push38=, 3 +; CHECK-NEXT: local.get $push35=, 3 ; CHECK-NEXT: i32.const $push2=, -2 -; CHECK-NEXT: i32.and $push37=, $pop38, $pop2 -; CHECK-NEXT: local.set 6, $pop37 -; CHECK-NEXT: i32.const $push39=, 0 -; CHECK-NEXT: local.set 5, $pop39 -; CHECK-NEXT: local.get $push40=, 0 -; CHECK-NEXT: local.set 3, $pop40 -; CHECK-NEXT: local.get $push41=, 1 -; CHECK-NEXT: local.set 7, $pop41 -; CHECK-NEXT: local.get $push42=, 2 -; CHECK-NEXT: local.set 8, $pop42 +; CHECK-NEXT: i32.and $push34=, $pop35, $pop2 +; CHECK-NEXT: local.set 6, $pop34 +; CHECK-NEXT: i32.const $push36=, 0 +; CHECK-NEXT: local.set 5, $pop36 ; CHECK-NEXT: .LBB3_3: # %bb20 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: loop # label6: -; CHECK-NEXT: local.get $push45=, 8 -; CHECK-NEXT: local.get $push43=, 3 -; CHECK-NEXT: f32.load $push4=, 0($pop43) -; CHECK-NEXT: local.get $push44=, 7 -; CHECK-NEXT: f32.load $push3=, 0($pop44) +; CHECK-NEXT: local.get $push38=, 2 +; CHECK-NEXT: local.get $push37=, 5 +; CHECK-NEXT: i32.add $push26=, $pop38, $pop37 +; CHECK-NEXT: local.tee $push25=, 3, $pop26 +; CHECK-NEXT: local.get $push40=, 0 +; CHECK-NEXT: local.get $push39=, 5 +; CHECK-NEXT: i32.add $push24=, $pop40, $pop39 +; CHECK-NEXT: local.tee $push23=, 7, $pop24 +; CHECK-NEXT: f32.load $push4=, 0($pop23) +; CHECK-NEXT: local.get $push42=, 1 +; CHECK-NEXT: local.get $push41=, 5 +; CHECK-NEXT: i32.add $push22=, $pop42, $pop41 +; CHECK-NEXT: local.tee $push21=, 8, $pop22 +; CHECK-NEXT: f32.load $push3=, 0($pop21) ; CHECK-NEXT: f32.add $push5=, $pop4, $pop3 -; CHECK-NEXT: f32.store 0($pop45), $pop5 -; CHECK-NEXT: local.get $push46=, 8 -; CHECK-NEXT: i32.const $push29=, 4 -; CHECK-NEXT: i32.add $push11=, $pop46, 
$pop29 -; CHECK-NEXT: local.get $push47=, 3 -; CHECK-NEXT: i32.const $push28=, 4 -; CHECK-NEXT: i32.add $push8=, $pop47, $pop28 -; CHECK-NEXT: f32.load $push9=, 0($pop8) -; CHECK-NEXT: local.get $push48=, 7 -; CHECK-NEXT: i32.const $push27=, 4 -; CHECK-NEXT: i32.add $push6=, $pop48, $pop27 -; CHECK-NEXT: f32.load $push7=, 0($pop6) -; CHECK-NEXT: f32.add $push10=, $pop9, $pop7 -; CHECK-NEXT: f32.store 0($pop11), $pop10 -; CHECK-NEXT: local.get $push50=, 3 -; CHECK-NEXT: i32.const $push26=, 8 -; CHECK-NEXT: i32.add $push49=, $pop50, $pop26 -; CHECK-NEXT: local.set 3, $pop49 -; CHECK-NEXT: local.get $push52=, 7 -; CHECK-NEXT: i32.const $push25=, 8 -; CHECK-NEXT: i32.add $push51=, $pop52, $pop25 -; CHECK-NEXT: local.set 7, $pop51 -; CHECK-NEXT: local.get $push54=, 8 -; CHECK-NEXT: i32.const $push24=, 8 -; CHECK-NEXT: i32.add $push53=, $pop54, $pop24 -; CHECK-NEXT: local.set 8, $pop53 -; CHECK-NEXT: local.get $push56=, 6 -; CHECK-NEXT: local.get $push55=, 5 -; CHECK-NEXT: i32.const $push23=, 2 -; CHECK-NEXT: i32.add $push22=, $pop55, $pop23 -; CHECK-NEXT: local.tee $push21=, 5, $pop22 -; CHECK-NEXT: i32.ne $push12=, $pop56, $pop21 -; CHECK-NEXT: br_if 0, $pop12 # 0: up to label6 +; CHECK-NEXT: f32.store 0($pop25), $pop5 +; CHECK-NEXT: local.get $push45=, 3 +; CHECK-NEXT: local.get $push43=, 7 +; CHECK-NEXT: f32.load $push7=, 4($pop43) +; CHECK-NEXT: local.get $push44=, 8 +; CHECK-NEXT: f32.load $push6=, 4($pop44) +; CHECK-NEXT: f32.add $push8=, $pop7, $pop6 +; CHECK-NEXT: f32.store 4($pop45), $pop8 +; CHECK-NEXT: local.get $push47=, 6 +; CHECK-NEXT: local.get $push46=, 5 +; CHECK-NEXT: i32.const $push20=, 2 +; CHECK-NEXT: i32.add $push19=, $pop46, $pop20 +; CHECK-NEXT: local.tee $push18=, 5, $pop19 +; CHECK-NEXT: i32.ne $push9=, $pop47, $pop18 +; CHECK-NEXT: br_if 0, $pop9 # 0: up to label6 ; CHECK-NEXT: .LBB3_4: # %bb9 ; CHECK-NEXT: end_loop ; CHECK-NEXT: end_block # label5: -; CHECK-NEXT: local.get $push57=, 4 -; CHECK-NEXT: i32.eqz $push65=, $pop57 -; CHECK-NEXT: 
br_if 0, $pop65 # 0: down to label4 +; CHECK-NEXT: local.get $push48=, 4 +; CHECK-NEXT: i32.eqz $push56=, $pop48 +; CHECK-NEXT: br_if 0, $pop56 # 0: down to label4 ; CHECK-NEXT: # %bb.5: # %bb12 -; CHECK-NEXT: local.get $push59=, 2 -; CHECK-NEXT: local.get $push58=, 5 -; CHECK-NEXT: i32.const $push13=, 2 -; CHECK-NEXT: i32.shl $push31=, $pop58, $pop13 -; CHECK-NEXT: local.tee $push30=, 3, $pop31 -; CHECK-NEXT: i32.add $push19=, $pop59, $pop30 -; CHECK-NEXT: local.get $push61=, 0 -; CHECK-NEXT: local.get $push60=, 3 -; CHECK-NEXT: i32.add $push16=, $pop61, $pop60 -; CHECK-NEXT: f32.load $push17=, 0($pop16) -; CHECK-NEXT: local.get $push63=, 1 -; CHECK-NEXT: local.get $push62=, 3 -; CHECK-NEXT: i32.add $push14=, $pop63, $pop62 -; CHECK-NEXT: f32.load $push15=, 0($pop14) -; CHECK-NEXT: f32.add $push18=, $pop17, $pop15 -; CHECK-NEXT: f32.store 0($pop19), $pop18 +; CHECK-NEXT: local.get $push50=, 2 +; CHECK-NEXT: local.get $push49=, 5 +; CHECK-NEXT: i32.const $push10=, 2 +; CHECK-NEXT: i32.shl $push28=, $pop49, $pop10 +; CHECK-NEXT: local.tee $push27=, 5, $pop28 +; CHECK-NEXT: i32.add $push16=, $pop50, $pop27 +; CHECK-NEXT: local.get $push52=, 0 +; CHECK-NEXT: local.get $push51=, 5 +; CHECK-NEXT: i32.add $push13=, $pop52, $pop51 +; CHECK-NEXT: f32.load $push14=, 0($pop13) +; CHECK-NEXT: local.get $push54=, 1 +; CHECK-NEXT: local.get $push53=, 5 +; CHECK-NEXT: i32.add $push11=, $pop54, $pop53 +; CHECK-NEXT: f32.load $push12=, 0($pop11) +; CHECK-NEXT: f32.add $push15=, $pop14, $pop12 +; CHECK-NEXT: f32.store 0($pop16), $pop15 ; CHECK-NEXT: .LBB3_6: # %bb19 ; CHECK-NEXT: end_block # label4: ; CHECK-NEXT: # fallthrough-return Index: llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn =================================================================== --- llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn +++ llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn @@ -57,6 +57,7 @@ "WebAssemblyMachineFunctionInfo.cpp", 
"WebAssemblyMemIntrinsicResults.cpp", "WebAssemblyNullifyDebugValueLists.cpp", + "WebAssemblyOptimizeGEPs.cpp", "WebAssemblyOptimizeLiveIntervals.cpp", "WebAssemblyOptimizeReturned.cpp", "WebAssemblyPeephole.cpp",