Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp =================================================================== --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -26,6 +26,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Attributor.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" +#include "llvm/Analysis/ValueTracking.h" using namespace llvm; using namespace omp; @@ -442,6 +443,87 @@ SmallPtrSetImpl &Kernels; }; +/// Used to map the values physically (in the IR) stored in an offload +/// array, to a vector in memory. +struct OffloadArray { + /// Physical array (in the IR). + AllocaInst *Array; + /// Mapped values. + SmallVector StoredValues; + /// Last stores made in the offload array. + SmallVector LastAccesses; + + OffloadArray() = default; + + /// Initializes the OffloadArray with the values stored in \p Array before + /// instruction \p Before is reached. Returns false if the initialization + /// fails. + /// This MUST be used immediately after construction the object. + bool initialize(AllocaInst &Array, Instruction &Before) { + if (!Array.getAllocatedType()->isArrayTy()) + return false; + + this->Array = &Array; + if (!getValues(Before)) + return false; + + return true; + } + +private: + /// Traverses the BasicBlock where Array is, collecting the stores made to + /// Array, leaving StoredValues with the values stored before the instruction + /// \p Before is reached. + bool getValues(Instruction &Before) { + // Initialize container. + const uint64_t NumValues = + Array->getAllocatedType()->getArrayNumElements(); + StoredValues.assign(NumValues, nullptr); + LastAccesses.assign(NumValues, nullptr); + + // TODO: This assumes the instruction \p Before is in the same + // BasicBlock as Array. Make it general, for any control flow graph. + BasicBlock *BB = Array->getParent(); + if (BB != Before.getParent()) + return false; + + const DataLayout &DL = Array->getModule()->getDataLayout(); + const unsigned int PointerSize = DL.getPointerSize(); + + for (Instruction &I : *BB) { + if (&I == &Before) + break; + + if (!isa(&I)) + continue; + + auto *S = cast(&I); + int64_t Offset = -1; + auto *Dst = GetPointerBaseWithConstantOffset(S->getPointerOperand(), + Offset, DL); + if (Dst == Array) { + int64_t Idx = Offset / PointerSize; + StoredValues[Idx] = getUnderlyingObject(S->getValueOperand()); + LastAccesses[Idx] = S; + } + } + + return isFilled(); + } + + /// Returns true if all values in StoredValues and + /// LastAccesses are not nullptrs. + bool isFilled() { + const unsigned NumValues = StoredValues.size(); + for (unsigned I = 0; I < NumValues; ++I) { + if (!StoredValues[I] || !LastAccesses[I]) + return false; + } + + return true; + } +}; + struct OpenMPOpt { using OptimizationRemarkGetter = @@ -652,6 +734,14 @@ if (!RTCall) return false; + OffloadArray OffloadArrays[3]; + if (!getValuesInOffloadArrays(*RTCall, OffloadArrays)) + return false; + +#ifndef NDEBUG + debugValuesInOffloadArrays(OffloadArrays); +#endif + // TODO: Check if can be moved upwards. bool WasSplit = false; Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall); @@ -666,6 +756,93 @@ return Changed; } + /// Maps the values stored in the offload arrays passed as arguments to + /// \p RuntimeCall into the offload arrays in \p OAs. + bool getValuesInOffloadArrays(CallInst &RuntimeCall, + MutableArrayRef OAs) { + assert(OAs.size() == 3 && "Need space for three offload arrays!"); + + // A runtime call that involves memory offloading looks something like: + // call void @__tgt_target_data_begin_mapper(arg0, arg1, + // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes, + // ...) + // So, the idea is to access the allocas that allocate space for these + // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes. + // Therefore: + // i8** %offload_baseptrs. + const unsigned BasePtrsArgNum = 2; + Value *BasePtrsArg = RuntimeCall.getArgOperand(BasePtrsArgNum); + // i8** %offload_ptrs. + const unsigned PtrsArgNum = 3; + Value *PtrsArg = RuntimeCall.getArgOperand(PtrsArgNum); + // i8** %offload_sizes. + const unsigned SizesArgNum = 4; + Value *SizesArg = RuntimeCall.getArgOperand(SizesArgNum); + + // Get values stored in **offload_baseptrs. + auto *V = getUnderlyingObject(BasePtrsArg); + if (!isa(V)) + return false; + auto *BasePtrsArray = cast(V); + if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall)) + return false; + + // Get values stored in **offload_baseptrs. + V = getUnderlyingObject(PtrsArg); + if (!isa(V)) + return false; + auto *PtrsArray = cast(V); + if (!OAs[1].initialize(*PtrsArray, RuntimeCall)) + return false; + + // Get values stored in **offload_sizes. + V = getUnderlyingObject(SizesArg); + // If it's a global array don't analyze it. + if (isa(V)) + return true; + if (!isa(V)) + return false; + + auto *SizesArray = cast(V); + if (!OAs[2].initialize(*SizesArray, RuntimeCall)) + return false; + + return true; + } + + /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG. + /// For now this is a way to test that the function getValuesInOffloadArrays + /// is working properly. + /// TODO: Move this to a unittest when unittests are available for OpenMPOpt. + void debugValuesInOffloadArrays(ArrayRef OAs) { + assert(OAs.size() == 3 && "There are three offload arrays to debug!"); + + LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n"); + std::string ValuesStr; + raw_string_ostream Printer(ValuesStr); + std::string Separator = " --- "; + + for (auto *BP : OAs[0].StoredValues) { + BP->print(Printer); + Printer << Separator; + } + LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n"); + ValuesStr.clear(); + + for (auto *P : OAs[1].StoredValues) { + P->print(Printer); + Printer << Separator; + } + LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n"); + ValuesStr.clear(); + + for (auto *S : OAs[2].StoredValues) { + S->print(Printer); + Printer << Separator; + } + LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n"); + } + /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be /// moved. Returns nullptr if the movement is not possible, or not worth it. Instruction *canBeMovedDownwards(CallInst &RuntimeCall) { Index: llvm/test/Transforms/OpenMP/values_in_offload_arrays.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/OpenMP/values_in_offload_arrays.ll @@ -0,0 +1,67 @@ +; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency -debug-only=openmp-opt < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +@.__omp_offloading_heavyComputation.region_id = weak constant i8 0 +@.offload_maptypes. = private unnamed_addr constant [2 x i64] [i64 35, i64 35] + +; CHECK-LABEL: {{[^@]+}}Successfully got offload values: +; CHECK-NEXT: offload_baseptrs: double* %a --- %size.addr = alloca i32, align 4 --- +; CHECK-NEXT: offload_ptrs: double* %a --- %size.addr = alloca i32, align 4 --- +; CHECK-NEXT: offload_sizes: %0 = shl nuw nsw i64 %conv, 3 --- i64 4 --- + +;int heavyComputation(double* a, unsigned size) { +; int random = rand() % 7; +; +; //#pragma omp target data map(a[0:size], size) +; void* args[2]; +; args[0] = &a; +; args[1] = &size; +; __tgt_target_data_begin(..., args, ...) +; +; #pragma omp target teams +; for (int i = 0; i < size; ++i) { +; a[i] = ++a[i] * 3.141624; +; } +; +; return random; +;} +define dso_local i32 @heavyComputation(double* %a, i32 %size) { +entry: + %size.addr = alloca i32, align 4 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + %.offload_sizes = alloca [2 x i64], align 8 + + store i32 %size, i32* %size.addr, align 4 + %call = tail call i32 (...) @rand() + + %conv = zext i32 %size to i64 + %0 = shl nuw nsw i64 %conv, 3 + %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 + %2 = bitcast [2 x i8*]* %.offload_baseptrs to double** + store double* %a, double** %2, align 8 + %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0 + %4 = bitcast [2 x i8*]* %.offload_ptrs to double** + store double* %a, double** %4, align 8 + %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0 + store i64 %0, i64* %5, align 8 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1 + %7 = bitcast i8** %6 to i32** + store i32* %size.addr, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1 + %9 = bitcast i8** %8 to i32** + store i32* %size.addr, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 + store i64 4, i64* %10, align 8 + call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null) + %rem = srem i32 %call, 7 + call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null) + ret i32 %rem +} + +declare void @__tgt_target_data_begin_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**) +declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**) + +declare dso_local i32 @rand(...)