diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -407,6 +407,8 @@
     return true;
   }
 
+  bool isInitialized() const { return Array != nullptr; }
+
   static const unsigned BasePtrsArgNum = 2;
   static const unsigned PtrsArgNum = 3;
   static const unsigned SizesArgNum = 4;
@@ -465,6 +467,160 @@
   }
 };
 
+/// Used to store the instructions that serve as setup ("issue") for the
+/// offload arrays passed to the runtime call __tgt_target_data_begin_mapper:
+///   __tgt_target_data_begin(...,
+///       i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
+///       ...)
+struct MemoryTransfer {
+  /// Call to __tgt_target_data_begin_mapper.
+  CallInst *RuntimeCall = nullptr;
+  /// The values stored in %offload_baseptrs, %offload_ptrs, and %offload_sizes
+  /// (before RuntimeCall is issued) are mapped here, in the first, second, and
+  /// third position, respectively.
+  ArrayRef<OffloadArray> OffloadArrays;
+  /// Used to access OffloadArrays.
+  static const unsigned BasePtrsIdx = 0;
+  static const unsigned PtrsIdx = 1;
+  static const unsigned SizesIdx = 2;
+  /// Set of instructions that compose the argument setup for RuntimeCall.
+  SetVector<Instruction *> Issue;
+
+  /// Factory function that initializes the MemoryTransfer by getting the
+  /// instructions that compose the argument setup ("issue") of RuntimeCall.
+  /// This should be used instead of the constructor.
+  bool initialize(CallInst &RuntimeCall,
+                  ArrayRef<OffloadArray> OffloadArrays) {
+    assert(OffloadArrays.size() == 3 &&
+           OffloadArrays[BasePtrsIdx].isInitialized() &&
+           OffloadArrays[PtrsIdx].isInitialized() &&
+           "No offload arrays to look at!");
+
+    if (!detectIssue(RuntimeCall, OffloadArrays))
+      return false;
+
+    this->RuntimeCall = &RuntimeCall;
+    this->OffloadArrays = OffloadArrays;
+    return true;
+  }
+
+  MemoryTransfer() = default;
+  MemoryTransfer(const MemoryTransfer &) = delete;
+  MemoryTransfer &operator=(const MemoryTransfer &) = delete;
+
+private:
+  /// Groups into Issue the instructions that compose the argument setup for
+  /// RuntimeCall.
+  bool detectIssue(CallInst &RuntimeCall,
+                   ArrayRef<OffloadArray> OffloadArrays) {
+    // Get setup instructions for %offload_baseptrs.
+    if (!getSetupInstructions(OffloadArrays[BasePtrsIdx]))
+      return false;
+
+    auto *BasePtrsGEP = cast<GetElementPtrInst>(
+        RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum));
+    Issue.insert(BasePtrsGEP);
+
+    // Get setup instructions for %offload_ptrs.
+    if (!getSetupInstructions(OffloadArrays[PtrsIdx]))
+      return false;
+    auto *PtrsGEP = cast<GetElementPtrInst>(
+        RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum));
+    Issue.insert(PtrsGEP);
+
+    // Get setup instructions for %offload_sizes.
+    // %offload_sizes may be a constant global array, so no need to analyze it.
+    if (!OffloadArrays[SizesIdx].isInitialized())
+      return true;
+    if (!getSetupInstructions(OffloadArrays[SizesIdx]))
+      return false;
+
+    auto *SizesGEP = cast<GetElementPtrInst>(
+        RuntimeCall.getArgOperand(OffloadArray::SizesArgNum));
+    Issue.insert(SizesGEP);
+
+    return true;
+  }
+
+  /// Gets the setup instructions for each of the values in \p OA. These
+  /// instructions are stored into Issue.
+  bool getSetupInstructions(const OffloadArray &OA) {
+    for (auto *S : OA.LastAccesses) {
+      if (!getValueSetupInstructions(*S))
+        return false;
+
+      if (!getPointerSetupInstructions(*S))
+        return false;
+
+      Issue.insert(S);
+    }
+    return true;
+  }
+
+  /// Gets the setup instructions for the value operand of \p S.
+  bool getValueSetupInstructions(StoreInst &S) {
+    auto *V = S.getValueOperand();
+
+    // Maximum number of def-chain steps walked back from the stored value.
+    const unsigned MaxLookup = 6;
+    // Auxiliary storage for later popping out the found instructions in the
+    // needed order.
+    SmallVector<Instruction *, MaxLookup> TempStorage;
+    bool Success = false;
+    for (unsigned I = 0; I < MaxLookup; ++I) {
+      // Stop successfully once the chain reaches a value that needs no setup.
+      if (isa<Argument>(V) || isa<GlobalValue>(V) || isa<ConstantData>(V) ||
+          isa<AllocaInst>(V)) {
+        Success = true;
+        break;
+      }
+
+      if (!isa<Instruction>(V)) {
+        Success = false;
+        break;
+      }
+
+      auto *Inst = cast<Instruction>(V);
+      TempStorage.push_back(Inst);
+
+      // FIXME: Inst might depend on more instructions through its second
+      // operand.
+      V = Inst->getOperand(0);
+    }
+
+    if (Success)
+      while (!TempStorage.empty())
+        Issue.insert(TempStorage.pop_back_val());
+
+    return Success;
+  }
+
+  /// Gets the setup instructions for the pointer operand of \p S.
+  bool getPointerSetupInstructions(StoreInst &S) {
+    auto *P = S.getPointerOperand();
+
+    // TODO: P might be a global value. Make it general.
+    if (!isa<Instruction>(P))
+      return false;
+
+    auto *DstInst = cast<Instruction>(P);
+    if (isa<GetElementPtrInst>(DstInst)) {
+      Issue.insert(DstInst);
+
+    } else if (DstInst->isCast()) {
+      auto *Casted = DstInst->getOperand(0);
+
+      // TODO: Casted might be a global value. Make it general.
+      if (!isa<Instruction>(Casted))
+        return false;
+
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(Casted))
+        Issue.insert(GEP);
+
+      Issue.insert(DstInst);
+    }
+
+    return true;
+  }
+};
+
 struct OpenMPOpt {
 
   using OptimizationRemarkGetter =
@@ -678,9 +834,13 @@
     OffloadArray OffloadArrays[3];
     if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
       return false;
-    LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
 
+    MemoryTransfer MT;
+    if (!MT.initialize(*RTCall, OffloadArrays))
+      return false;
+    LLVM_DEBUG(dumpMemoryTransferSetupInstructions(MT));
+
+    // TODO: Check if it can be moved upwards.
     bool WasSplit = false;
     Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
@@ -734,6 +894,7 @@
     // Get values stored in **offload_sizes.
     V = getUnderlyingObject(SizesArg);
+    // If it's a [constant] global array, don't analyze it.
     if (isa<GlobalValue>(V))
       return isa<Constant>(V);
@@ -780,6 +941,19 @@
     LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
   }
 
+  void dumpMemoryTransferSetupInstructions(MemoryTransfer &MT) {
+    LLVM_DEBUG(dbgs() << TAG << " Successfully got setup instructions for: "
+                      << MT.RuntimeCall->getCaller()->getName() << "\n");
+    std::string ValuesStr;
+    raw_string_ostream Printer(ValuesStr);
+
+    for (auto *I : MT.Issue) {
+      I->print(Printer);
+      Printer << "\n";
+    }
+    LLVM_DEBUG(dbgs() << Printer.str());
+  }
+
   /// Returns the instruction where the "wait" counterpart of \p RuntimeCall
   /// can be moved. Returns nullptr if the movement is not possible, or not
   /// worth it.
   Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
diff --git a/llvm/test/Transforms/OpenMP/runtime_call_setup_instructions.ll b/llvm/test/Transforms/OpenMP/runtime_call_setup_instructions.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/runtime_call_setup_instructions.ll
@@ -0,0 +1,117 @@
+; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency -debug-only=openmp-opt < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+@.offload_maptypes. = private unnamed_addr constant [2 x i64] [i64 35, i64 35]
+@.__omp_offloading_heavyComputation.region_id = weak constant i8 0
+@.offload_sizes. = private unnamed_addr constant [2 x i64] [i64 4, i64 0]
+@.offload_maptypes.2 = private unnamed_addr constant [2 x i64] [i64 800, i64 544]
+
+; CHECK-LABEL: {{[^@]+}}Successfully got setup instructions for: heavyComputation
+; CHECK-NEXT: %2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
+; CHECK-NEXT: store double* %a, double** %2, align 8
+; CHECK-NEXT: %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
+; CHECK-NEXT: %7 = bitcast i8** %6 to i32**
+; CHECK-NEXT: store i32* %size.addr, i32** %7, align 8
+; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
+; CHECK-NEXT: %4 = bitcast [2 x i8*]* %.offload_ptrs to double**
+; CHECK-NEXT: store double* %a, double** %4, align 8
+; CHECK-NEXT: %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
+; CHECK-NEXT: %9 = bitcast i8** %8 to i32**
+; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8
+; CHECK-NEXT: %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
+; CHECK-NEXT: %conv = zext i32 %size to i64
+; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3
+; CHECK-NEXT: %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
+; CHECK-NEXT: store i64 %0, i64* %5, align 8
+; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
+; CHECK-NEXT: store i64 4, i64* %10, align 8
+
+;int heavyComputation(double* restrict a, unsigned size) {
+;  int random = rand() % 7;
+;
+;  //#pragma omp target data map(a[0:size], size)
+;  void* args[2];
+;  args[0] = &a;
+;  args[1] = &size;
+;  __tgt_target_data_begin(..., args, ...)
+;
+;  #pragma omp target teams
+;  for (int i = 0; i < size; ++i) {
+;    a[i] = ++a[i] * 3.141624;
+;  }
+;
+;  return random;
+;}
+define dso_local i32 @heavyComputation(double* noalias %a, i32 %size) {
+entry:
+  %size.addr = alloca i32, align 4
+  %.offload_baseptrs = alloca [2 x i8*], align 8
+  %.offload_ptrs = alloca [2 x i8*], align 8
+  %.offload_sizes = alloca [2 x i64], align 8
+  %.offload_baseptrs2 = alloca [2 x i8*], align 8
+  %.offload_ptrs3 = alloca [2 x i8*], align 8
+  store i32 %size, i32* %size.addr, align 4
+
+  %call = tail call i32 (...) @rand()
+
+  %conv = zext i32 %size to i64
+  %0 = shl nuw nsw i64 %conv, 3
+  %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
+  %2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
+  store double* %a, double** %2, align 8
+  %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
+  %4 = bitcast [2 x i8*]* %.offload_ptrs to double**
+  store double* %a, double** %4, align 8
+  %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
+  store i64 %0, i64* %5, align 8
+  %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
+  %7 = bitcast i8** %6 to i32**
+  store i32* %size.addr, i32** %7, align 8
+  %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
+  %9 = bitcast i8** %8 to i32**
+  store i32* %size.addr, i32** %9, align 8
+  %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
+  store i64 4, i64* %10, align 8
+  call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null)
+
+  %11 = load i32, i32* %size.addr, align 4
+  %size.casted = zext i32 %11 to i64
+  %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
+  %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64*
+  store i64 %size.casted, i64* %13, align 8
+  %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0
+  %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64*
+  store i64 %size.casted, i64* %15, align 8
+  %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1
+  %17 = bitcast i8** %16 to double**
+  store double* %a, double** %17, align 8
+  %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
+  %19 = bitcast i8** %18 to double**
+  store double* %a, double** %19, align 8
+
+  %20 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes., i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i32 0, i32 0)
+  %.not = icmp eq i32 %20, 0
+  br i1 %.not, label %omp_offload.cont, label %omp_offload.failed
+
+omp_offload.failed:                               ; preds = %entry
+  call void @heavyComputationFallBack(i64 %size.casted, double* %a)
+  br label %omp_offload.cont
+
+omp_offload.cont:                                 ; preds = %omp_offload.failed, %entry
+  %rem = srem i32 %call, 7
+  call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null)
+  ret i32 %rem
+}
+
+define internal void @heavyComputationFallBack(i64 %size, double* %a) {
+entry:
+  ; Fallback for the offloading region in heavyComputation.
+  ret void
+}
+
+declare void @__tgt_target_data_begin_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
+declare i32 @__tgt_target_teams_mapper(i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i32, i32)
+declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
+
+declare dso_local i32 @rand(...)
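
Note (illustrative, not part of this patch): the Issue set is collected so that a
follow-up change can move the argument setup and the runtime call to an earlier
program point and split the synchronous call into an "issue"/"wait" pair, letting
the host-to-device transfer overlap with independent host code. A rough IR sketch
of that intended end state, assuming split entry points named
__tgt_target_data_begin_mapper_issue / __tgt_target_data_begin_mapper_wait and a
%struct.__tgt_async_info handle (these names are assumptions here, not something
this patch defines):

  ; Hypothetical result for @heavyComputation above. The instructions in
  ; MT.Issue (the GEPs, bitcasts, stores, %conv, and %0) are re-emitted as
  ; early as their operands allow, immediately before the "issue".
  %handle = alloca %struct.__tgt_async_info, align 8
  call void @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null, %struct.__tgt_async_info* %handle)

  ; Independent host code, e.g. the call to @rand(), runs here while the
  ; transfer is in flight.

  call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info* %handle)

The test pins down exactly which instructions form that movable set; the order of
the CHECK-NEXT lines matters because it is the insertion order of the Issue
SetVector.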