Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/Attributor.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Analysis/ValueTracking.h"
 
 using namespace llvm;
 using namespace omp;
@@ -442,6 +443,126 @@
   SmallPtrSetImpl<Kernel> &Kernels;
 };
 
+/// Used to map the values physically (in the IR) stored in an offload
+/// array, to a vector in memory.
+struct OffloadArray {
+  /// Physical array (in the IR).
+  AllocaInst &Array;
+  /// Mapped values.
+  SmallVector<Value *, 8> StoredValues;
+  /// Last stores made to the offload array.
+  SmallVector<StoreInst *, 8> LastAccesses;
+
+  OffloadArray(AllocaInst &Array) : Array(Array) {}
+
+  /// Factory function for creating and initializing the OffloadArray with
+  /// the values stored in \p Array before the instruction \p Before is
+  /// reached.
+  /// This MUST be used instead of the constructor.
+  static std::unique_ptr<OffloadArray> initialize(AllocaInst &Array,
+                                                  Instruction &Before) {
+    if (!Array.getAllocatedType()->isArrayTy())
+      return nullptr;
+
+    auto OA = std::make_unique<OffloadArray>(Array);
+    if (!OA->getValues(Before))
+      return nullptr;
+
+    return OA;
+  }
+
+  /// Traverses the BasicBlock where \p Array is, collecting the stores made
+  /// to \p Array, leaving StoredValues with the values stored before the
+  /// instruction \p Before is reached.
+  bool getValues(Instruction &Before) {
+    // Initialize containers.
+    const uint64_t NumValues =
+        Array.getAllocatedType()->getArrayNumElements();
+    StoredValues.assign(NumValues, nullptr);
+    LastAccesses.assign(NumValues, nullptr);
+
+    // TODO: This assumes the instruction \p Before is in the same
+    //  BasicBlock as \p Array. Make it general, for any control flow graph.
+    BasicBlock *BB = Array.getParent();
+    if (BB != Before.getParent())
+      return false;
+
+    for (Instruction &I : *BB) {
+      if (&I == &Before)
+        break;
+
+      if (!isa<StoreInst>(&I))
+        continue;
+
+      auto *S = cast<StoreInst>(&I);
+      auto *Dst = getUnderlyingObject(S->getPointerOperand());
+      if (Dst == &Array) {
+        int64_t Idx = getAccessedIdx(*S);
+        // Unexpected StoreInst.
+        if (Idx < 0)
+          return false;
+
+        StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
+        LastAccesses[Idx] = S;
+      }
+    }
+
+    return isFilled();
+  }
+
+  /// Returns the index of \p Array that the store \p S writes to.
+  /// Returns -1 if the index can't be deduced. Assumes \p S is a store
+  /// to \p Array.
+  int64_t getAccessedIdx(StoreInst &S) {
+    auto *Dst = S.getPointerOperand();
+    // Unrecognized store pattern.
+    if (!isa<Instruction>(Dst))
+      return -1;
+
+    auto *DstInst = cast<Instruction>(Dst);
+    Value *Access = DstInst;
+    if (DstInst->isCast()) {
+      Access = DstInst->getOperand(0);
+
+      // Direct cast from the AllocaInst, which means a store to the
+      // first position of the array.
+      if (Access == &Array)
+        return 0;
+    }
+
+    // Unrecognized store pattern.
+    if (!isa<GetElementPtrInst>(Access))
+      return -1;
+
+    auto *GEPInst = cast<GetElementPtrInst>(Access);
+    // Unrecognized store pattern.
+    if (!GEPInst->hasIndices())
+      return -1;
+
+    auto *ArrayIdx = GEPInst->idx_begin() + 1;
+    // Unrecognized store pattern.
+    if (ArrayIdx == GEPInst->idx_end())
+      return -1;
+
+    // The accessed position must be a compile-time constant.
+    if (!isa<ConstantInt>(ArrayIdx->get()))
+      return -1;
+
+    return cast<ConstantInt>(ArrayIdx->get())->getZExtValue();
+  }
+
+  /// Returns true if all values in StoredValues and
+  /// LastAccesses are not nullptrs.
+  bool isFilled() {
+    const unsigned NumValues = StoredValues.size();
+    for (unsigned I = 0; I < NumValues; ++I) {
+      if (!StoredValues[I] || !LastAccesses[I])
+        return false;
+    }
+
+    return true;
+  }
+};
+
+using OffloadArrayPtr = std::unique_ptr<OffloadArray>;
+
 struct OpenMPOpt {
 
   using OptimizationRemarkGetter =
@@ -652,6 +773,11 @@
     if (!RTCall)
       return false;
 
+    OffloadArrayPtr OffloadArrays[3];
+    if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
+      return false;
+    debugValuesInOffloadArrays(OffloadArrays);
+
     // TODO: Check if can be moved upwards.
     bool WasSplit = false;
     Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
@@ -666,6 +792,94 @@
     return Changed;
   }
 
+  /// Maps the values stored in the offload arrays passed as arguments to
+  /// \p RuntimeCall into the offload arrays in \p OAs.
+  bool getValuesInOffloadArrays(CallInst &RuntimeCall,
+                                MutableArrayRef<OffloadArrayPtr> OAs) {
+    assert(OAs.size() == 3 && "Need space for three offload arrays!");
+
+    // A runtime call that involves memory offloading looks something like:
+    // call void @__tgt_target_data_begin_mapper(arg0, arg1,
+    //   i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
+    //   ...)
+    // So, the idea is to access the allocas that allocate space for these
+    // offload arrays: offload_baseptrs, offload_ptrs, offload_sizes.
+    // Therefore:
+    // i8** %offload_baseptrs.
+    const unsigned BasePtrsArgNum = 2;
+    Value *BasePtrsArg = RuntimeCall.getArgOperand(BasePtrsArgNum);
+    // i8** %offload_ptrs.
+    const unsigned PtrsArgNum = 3;
+    Value *PtrsArg = RuntimeCall.getArgOperand(PtrsArgNum);
+    // i64* %offload_sizes.
+    const unsigned SizesArgNum = 4;
+    Value *SizesArg = RuntimeCall.getArgOperand(SizesArgNum);
+
+    // Get values stored in **offload_baseptrs.
+    auto *V = getUnderlyingObject(BasePtrsArg);
+    if (!isa<AllocaInst>(V))
+      return false;
+    auto *BasePtrsArray = cast<AllocaInst>(V);
+    OAs[0] = OffloadArray::initialize(*BasePtrsArray, RuntimeCall);
+    if (!OAs[0])
+      return false;
+
+    // Get values stored in **offload_ptrs.
+    V = getUnderlyingObject(PtrsArg);
+    if (!isa<AllocaInst>(V))
+      return false;
+    auto *PtrsArray = cast<AllocaInst>(V);
+    OAs[1] = OffloadArray::initialize(*PtrsArray, RuntimeCall);
+    if (!OAs[1])
+      return false;
+
+    // Get values stored in *offload_sizes. If the sizes are a compile-time
+    // constant global array there is nothing to analyze, and OAs[2] is left
+    // as nullptr.
+    V = getUnderlyingObject(SizesArg);
+    if (!isa<Constant>(V)) {
+      if (!isa<AllocaInst>(V))
+        return false;
+
+      auto *SizesArray = cast<AllocaInst>(V);
+      OAs[2] = OffloadArray::initialize(*SizesArray, RuntimeCall);
+      if (!OAs[2])
+        return false;
+    }
+
+    return true;
+  }
+
+  /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
+  void debugValuesInOffloadArrays(ArrayRef<OffloadArrayPtr> OAs) {
+    assert(OAs.size() == 3 && "There are three offload arrays to debug!");
+
+    LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
+    std::string ValuesStr;
+    raw_string_ostream Printer(ValuesStr);
+    std::string Separator = " --- ";
+
+    for (auto *BP : OAs[0]->StoredValues) {
+      BP->print(Printer);
+      Printer << Separator;
+    }
+    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
+    ValuesStr.clear();
+
+    for (auto *P : OAs[1]->StoredValues) {
+      P->print(Printer);
+      Printer << Separator;
+    }
+    LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
+    ValuesStr.clear();
+
+    // OAs[2] is null when the offload sizes are a constant global array.
+    if (OAs[2]) {
+      for (auto *S : OAs[2]->StoredValues) {
+        S->print(Printer);
+        Printer << Separator;
+      }
+      LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
+    }
+  }
+
   /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
   /// moved. Returns nullptr if the movement is not possible, or not worth it.
   Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
Index: llvm/test/Transforms/OpenMP/values_in_offload_arrays.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/OpenMP/values_in_offload_arrays.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency -debug-only=openmp-opt < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+@.__omp_offloading_heavyComputation.region_id = weak constant i8 0
+@.offload_maptypes. = private unnamed_addr constant [2 x i64] [i64 35, i64 35]
+
+; CHECK-LABEL: {{[^@]+}}Successfully got offload values:
+; CHECK-NEXT: offload_baseptrs: double* %a --- %size.addr = alloca i32, align 4 ---
+; CHECK-NEXT: offload_ptrs: double* %a --- %size.addr = alloca i32, align 4 ---
+; CHECK-NEXT: offload_sizes: %0 = shl nuw nsw i64 %conv, 3 --- i64 4 ---
+
+;int heavyComputation(double* a, unsigned size) {
+;  int random = rand() % 7;
+;
+;  //#pragma omp target data map(a[0:size], size)
+;  void* args[2];
+;  args[0] = &a;
+;  args[1] = &size;
+;  __tgt_target_data_begin(..., args, ...)
+;
+;  #pragma omp target teams
+;  for (int i = 0; i < size; ++i) {
+;    a[i] = ++a[i] * 3.141624;
+;  }
+;
+;  return random;
+;}
+define dso_local i32 @heavyComputation(double* %a, i32 %size) {
+entry:
+  %size.addr = alloca i32, align 4
+  %.offload_baseptrs = alloca [2 x i8*], align 8
+  %.offload_ptrs = alloca [2 x i8*], align 8
+  %.offload_sizes = alloca [2 x i64], align 8
+
+  store i32 %size, i32* %size.addr, align 4
+  %call = tail call i32 (...) @rand()
+
+  %conv = zext i32 %size to i64
+  %0 = shl nuw nsw i64 %conv, 3
+  %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
+  %2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
+  store double* %a, double** %2, align 8
+  %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
+  %4 = bitcast [2 x i8*]* %.offload_ptrs to double**
+  store double* %a, double** %4, align 8
+  %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
+  store i64 %0, i64* %5, align 8
+  %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
+  %7 = bitcast i8** %6 to i32**
+  store i32* %size.addr, i32** %7, align 8
+  %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
+  %9 = bitcast i8** %8 to i32**
+  store i32* %size.addr, i32** %9, align 8
+  %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
+  store i64 4, i64* %10, align 8
+  call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null)
+  %rem = srem i32 %call, 7
+  call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null)
+  ret i32 %rem
+}
+
+declare void @__tgt_target_data_begin_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
+declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
+
+declare dso_local i32 @rand(...)
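
Reviewer note (not part of the patch): a minimal sketch of how the new OffloadArray helper is meant to be queried, assuming it is placed in OpenMPOpt.cpp after struct OffloadArray. The helper name collectBasePtrs and the hard-coded argument position are illustrative only; they mirror getValuesInOffloadArrays above.

// Hypothetical helper, shown only to illustrate the OffloadArray API.
static SmallVector<Value *, 8> collectBasePtrs(CallInst &RuntimeCall) {
  // %offload_baseptrs is the third argument of
  // __tgt_target_data_begin_mapper, as in getValuesInOffloadArrays.
  Value *BasePtrsArg = RuntimeCall.getArgOperand(/*BasePtrsArgNum=*/2);

  // The array must be traceable to a statically sized alloca.
  auto *BasePtrsArray = dyn_cast<AllocaInst>(getUnderlyingObject(BasePtrsArg));
  if (!BasePtrsArray)
    return {};

  // initialize() returns nullptr if some element of the array cannot be
  // traced to a store made before the runtime call in the same BasicBlock.
  OffloadArrayPtr OA = OffloadArray::initialize(*BasePtrsArray, RuntimeCall);
  if (!OA)
    return {};

  // The mapped base pointers, in array order (e.g. double* %a and
  // %size.addr in the test above).
  return OA->StoredValues;
}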