diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/FunctionPropertiesAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
@@ -567,6 +568,8 @@
           Changed = true;
         }
       }
+
+      Changed |= injectKernelFeatures();
     }
 
     return Changed;
@@ -1470,6 +1473,83 @@
     return Changed;
   }
 
+  /// Attach static code features to every OpenMP kernel in the current SCC.
+  ///
+  /// For each function of the SCC that is contained in OMPInfoCache.Kernels,
+  /// the FunctionPropertiesInfo returned by the information cache is packed
+  /// into a constant [8 x i64] global variable with external linkage named
+  /// '$KernelName.KernelFeatures'. The elements are, in order:
+  /// BasicBlockCount, BlocksReachedFromConditionalInstruction,
+  /// DirectCallsToDefinedFunctions, LoadInstCount, MaxLoopDepth,
+  /// StoreInstCount, TopLevelLoopCount and Uses.
+  ///
+  /// Kernels without an analysis result, or that already have a feature
+  /// vector in the module, are skipped.
+  ///
+  /// \returns true if at least one global variable was added to the module.
+  bool injectKernelFeatures() {
+    bool IsChanged = false;
+    IntegerType *FeatureType = Type::getInt64Ty(M.getContext());
+
+    for (Function *F : SCC) {
+      if (!OMPInfoCache.Kernels.count(F))
+        continue;
+
+      // Stay idempotent if the same kernel is visited again: a second
+      // GlobalVariable with this name would be auto-renamed, leaving a
+      // duplicate feature vector in the module.
+      std::string FeatureVarName = (F->getName() + ".KernelFeatures").str();
+      if (M.getNamedGlobal(FeatureVarName))
+        continue;
+
+      FunctionPropertiesInfo *FunctionInfo =
+          OMPInfoCache.getAnalysisResultForFunction(*F);
+      if (!FunctionInfo) {
+        // OMPInfoCache has not returned the analysis result -- notify the
+        // debug output and proceed to the next kernel.
+        LLVM_DEBUG(dbgs() << TAG
+                          << " No valid function info received for kernel "
+                          << F->getName() << "\n");
+        continue;
+      }
+
+      // Keep the values in a named array. An ArrayRef does not own its
+      // elements, so initializing one from a braced list would leave it
+      // referring to a temporary that dies at the end of the declaration.
+      Constant *FeatureValues[] = {
+          ConstantInt::get(FeatureType, FunctionInfo->BasicBlockCount),
+          ConstantInt::get(
+              FeatureType,
+              FunctionInfo->BlocksReachedFromConditionalInstruction),
+          ConstantInt::get(FeatureType,
+                           FunctionInfo->DirectCallsToDefinedFunctions),
+          ConstantInt::get(FeatureType, FunctionInfo->LoadInstCount),
+          ConstantInt::get(FeatureType, FunctionInfo->MaxLoopDepth),
+          ConstantInt::get(FeatureType, FunctionInfo->StoreInstCount),
+          ConstantInt::get(FeatureType, FunctionInfo->TopLevelLoopCount),
+          ConstantInt::get(FeatureType, FunctionInfo->Uses),
+      };
+      ArrayRef<Constant *> FeatureArrayRef(FeatureValues);
+      ArrayType *FeatureArrayType =
+          ArrayType::get(FeatureType, FeatureArrayRef.size());
+      Constant *FeatureArray =
+          ConstantArray::get(FeatureArrayType, FeatureArrayRef);
+
+      // The constructor taking a Module inserts the new variable into M, so
+      // the returned pointer does not have to be kept.
+      new GlobalVariable(M, FeatureArrayType, /*isConstant=*/true,
+                         GlobalValue::ExternalLinkage, FeatureArray,
+                         FeatureVarName);
+      IsChanged = true;
+
+      LLVM_DEBUG(FeatureArray->print(dbgs() << TAG
+                                            << " Function info for kernel "
+                                            << F->getName() << " is: "));
+      LLVM_DEBUG(dbgs() << "\n");
+    }
+    return IsChanged;
+  }
+
   /// Collect arguments that represent the global thread id in \p GTIdArgs.
   void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
     // TODO: Below we basically perform a fixpoint iteration with a pessimistic
diff --git a/llvm/test/Transforms/OpenMP/gpu_kernel_features.ll b/llvm/test/Transforms/OpenMP/gpu_kernel_features.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/gpu_kernel_features.ll
@@ -0,0 +1,162 @@
+; RUN: opt -passes=openmp-opt-cgscc -S < %s | FileCheck %s
+
+; Each function tagged "kernel" in !nvvm.annotations is expected to get a
+; constant [8 x i64] global named '<kernel>.KernelFeatures'. The values are,
+; in order: BasicBlockCount, BlocksReachedFromConditionalInstruction,
+; DirectCallsToDefinedFunctions, LoadInstCount, MaxLoopDepth, StoreInstCount,
+; TopLevelLoopCount, Uses.
+
+; CHECK-DAG: @empty_kernel1.KernelFeatures = constant [8 x i64] [i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1]
+; CHECK-DAG: @empty_kernel2.KernelFeatures = constant [8 x i64] [i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1]
+; CHECK-DAG: @matmul_kernel.KernelFeatures = constant [8 x i64] [i64 1, i64 0, i64 1, i64 0, i64 0, i64 1, i64 0, i64 1]
+; CHECK-DAG: @multiply_kernel.KernelFeatures = constant [8 x i64] [i64 13, i64 6, i64 0, i64 21, i64 3, i64 11, i64 1, i64 2]
+
+define void @empty_kernel1() {
+  ret void
+}
+
+define void @empty_kernel2() {
+  ret void
+}
+
+; Tagged "non_kernel" rather than "kernel" in !nvvm.annotations; none of the
+; checks above refers to it.
+define void @non_kernel() {
+  ret void
+}
+
+
+; Single basic block containing one store and one direct call to
+; @multiply_kernel.
+define i32 @matmul_kernel() {
+entry:
+  %retval = alloca i32, align 4
+  %mat1 = alloca [2 x [2 x i32]], align 16
+  %mat2 = alloca [2 x [2 x i32]], align 16
+  %res = alloca [2 x [2 x i32]], align 16
+  %i = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  %arraydecay = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %mat1, i64 0, i64 0
+  %arraydecay1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %mat2, i64 0, i64 0
+  %arraydecay2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %res, i64 0, i64 0
+  call void @multiply_kernel([2 x i32]* %arraydecay, [2 x i32]* %arraydecay1, [2 x i32]* %arraydecay2)
+  ret i32 0
+}
+
+; Three nested loops (over %i, %j and %k) spread across 13 basic blocks;
+; called once from @matmul_kernel.
+define void @multiply_kernel([2 x i32]* %mat1, [2 x i32]* %mat2, [2 x i32]* %res) {
+entry:
+  %mat1.addr = alloca [2 x i32]*, align 8
+  %mat2.addr = alloca [2 x i32]*, align 8
+  %res.addr = alloca [2 x i32]*, align 8
+  %i = alloca i32, align 4
+  %j = alloca i32, align 4
+  %k = alloca i32, align 4
+  store [2 x i32]* %mat1, [2 x i32]** %mat1.addr, align 8
+  store [2 x i32]* %mat2, [2 x i32]** %mat2.addr, align 8
+  store [2 x i32]* %res, [2 x i32]** %res.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc24, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 2
+  br i1 %cmp, label %for.body, label %for.end26
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, i32* %j, align 4
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc21, %for.body
+  %1 = load i32, i32* %j, align 4
+  %cmp2 = icmp slt i32 %1, 2
+  br i1 %cmp2, label %for.body3, label %for.end23
+
+for.body3:                                        ; preds = %for.cond1
+  %2 = load [2 x i32]*, [2 x i32]** %res.addr, align 8
+  %3 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %3 to i64
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %2, i64 %idxprom
+  %4 = load i32, i32* %j, align 4
+  %idxprom4 = sext i32 %4 to i64
+  %arrayidx5 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx, i64 0, i64 %idxprom4
+  store i32 0, i32* %arrayidx5, align 4
+  store i32 0, i32* %k, align 4
+  br label %for.cond6
+
+for.cond6:                                        ; preds = %for.inc, %for.body3
+  %5 = load i32, i32* %k, align 4
+  %cmp7 = icmp slt i32 %5, 2
+  br i1 %cmp7, label %for.body8, label %for.end
+
+for.body8:                                        ; preds = %for.cond6
+  %6 = load [2 x i32]*, [2 x i32]** %mat1.addr, align 8
+  %7 = load i32, i32* %i, align 4
+  %idxprom9 = sext i32 %7 to i64
+  %arrayidx10 = getelementptr inbounds [2 x i32], [2 x i32]* %6, i64 %idxprom9
+  %8 = load i32, i32* %k, align 4
+  %idxprom11 = sext i32 %8 to i64
+  %arrayidx12 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx10, i64 0, i64 %idxprom11
+  %9 = load i32, i32* %arrayidx12, align 4
+  %10 = load [2 x i32]*, [2 x i32]** %mat2.addr, align 8
+  %11 = load i32, i32* %k, align 4
+  %idxprom13 = sext i32 %11 to i64
+  %arrayidx14 = getelementptr inbounds [2 x i32], [2 x i32]* %10, i64 %idxprom13
+  %12 = load i32, i32* %j, align 4
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx14, i64 0, i64 %idxprom15
+  %13 = load i32, i32* %arrayidx16, align 4
+  %mul = mul nsw i32 %9, %13
+  %14 = load [2 x i32]*, [2 x i32]** %res.addr, align 8
+  %15 = load i32, i32* %i, align 4
+  %idxprom17 = sext i32 %15 to i64
+  %arrayidx18 = getelementptr inbounds [2 x i32], [2 x i32]* %14, i64 %idxprom17
+  %16 = load i32, i32* %j, align 4
+  %idxprom19 = sext i32 %16 to i64
+  %arrayidx20 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx18, i64 0, i64 %idxprom19
+  %17 = load i32, i32* %arrayidx20, align 4
+  %add = add nsw i32 %17, %mul
+  store i32 %add, i32* %arrayidx20, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body8
+  %18 = load i32, i32* %k, align 4
+  %inc = add nsw i32 %18, 1
+  store i32 %inc, i32* %k, align 4
+  br label %for.cond6
+
+for.end:                                          ; preds = %for.cond6
+  br label %for.inc21
+
+for.inc21:                                        ; preds = %for.end
+  %19 = load i32, i32* %j, align 4
+  %inc22 = add nsw i32 %19, 1
+  store i32 %inc22, i32* %j, align 4
+  br label %for.cond1
+
+for.end23:                                        ; preds = %for.cond1
+  br label %for.inc24
+
+for.inc24:                                        ; preds = %for.end23
+  %20 = load i32, i32* %i, align 4
+  %inc25 = add nsw i32 %20, 1
+  store i32 %inc25, i32* %i, align 4
+  br label %for.cond
+
+for.end26:                                        ; preds = %for.cond
+  ret void
+}
+
+; Needed to trigger the openmp-opt pass
+declare dso_local void @__kmpc_kernel_prepare_parallel(i8*)
+
+!nvvm.annotations = !{!2, !0, !1, !3, !1, !4, !5, !2}
+
+!0 = !{void ()* @empty_kernel1, !"kernel", i32 1}
+!1 = !{void ()* @non_kernel, !"non_kernel", i32 1}
+!2 = !{null, !"align", i32 1}
+!3 = !{void ()* @empty_kernel2, !"kernel", i32 1}
+!4 = !{i32 ()* @matmul_kernel, !"kernel", i32 1}
+!5 = !{void ([2 x i32]*, [2 x i32]*, [2 x i32]*)* @multiply_kernel, !"kernel", i32 1}