diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/FunctionPropertiesAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
@@ -517,8 +518,7 @@
       function_ref<OptimizationRemarkEmitter &(Function *)>;
 
   OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
-            OptimizationRemarkGetter OREGetter,
-            OMPInformationCache &OMPInfoCache, Attributor &A)
+            OptimizationRemarkGetter OREGetter, OMPInformationCache &OMPInfoCache, Attributor &A)
       : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
         OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
 
@@ -567,6 +567,8 @@
           Changed = true;
         }
       }
+
+      Changed |= injectKernelFeatures();
     }
 
     return Changed;
@@ -1470,6 +1472,49 @@
     return Changed;
   }
 
+  /// Emit each SCC kernel's FunctionPropertiesAnalysis result as a constant
+  /// global '$KernelName.KernelFeatures'. \returns true if a global was added.
+  bool injectKernelFeatures() {
+    bool IsChanged = false;
+    for (Function *F : SCC) {
+      if (!OMPInfoCache.Kernels.count(F))
+        continue;
+      FunctionPropertiesInfo *FunctionInfo =
+          OMPInfoCache
+              .getAnalysisResultForFunction<FunctionPropertiesAnalysis>(*F);
+      if (!FunctionInfo) {
+        LLVM_DEBUG(dbgs() << TAG
+                          << " No valid function info received for kernel "
+                          << F->getName() << "\n");
+        continue;
+      }
+      // Use a named array; an ArrayRef built from a braced list would dangle.
+      IntegerType *FeatureType = Type::getInt64Ty(M.getContext());
+      auto Val = [&](uint64_t V) { return ConstantInt::get(FeatureType, V); };
+      Constant *FeatureValues[] = {
+          Val(FunctionInfo->BasicBlockCount),
+          Val(FunctionInfo->BlocksReachedFromConditionalInstruction),
+          Val(FunctionInfo->DirectCallsToDefinedFunctions),
+          Val(FunctionInfo->LoadInstCount),
+          Val(FunctionInfo->MaxLoopDepth),
+          Val(FunctionInfo->StoreInstCount),
+          Val(FunctionInfo->TopLevelLoopCount),
+          Val(FunctionInfo->Uses),
+      };
+      ArrayRef<Constant *> Features(FeatureValues);
+      auto *FeatureArrayType = ArrayType::get(FeatureType, Features.size());
+      Constant *FeatureArray = ConstantArray::get(FeatureArrayType, Features);
+      // The constructor inserts the global into M, so no handle is kept.
+      new GlobalVariable(M, FeatureArrayType, /*isConstant=*/true,
+                         GlobalValue::ExternalLinkage, FeatureArray,
+                         F->getName() + ".KernelFeatures");
+      IsChanged = true;
+      LLVM_DEBUG(dbgs() << TAG << " Function info for kernel " << F->getName()
+                        << " is: " << *FeatureArray << "\n");
+    }
+    return IsChanged;
+  }
+
   /// Collect arguments that represent the global thread id in \p GTIdArgs.
   void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
     // TODO: Below we basically perform a fixpoint iteration with a pessimistic
diff --git a/llvm/test/Transforms/OpenMP/gpu_kernel_features.ll b/llvm/test/Transforms/OpenMP/gpu_kernel_features.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/gpu_kernel_features.ll
@@ -0,0 +1,150 @@
+; RUN: opt -passes=openmp-opt-cgscc -S < %s | FileCheck %s
+
+; CHECK-DAG: @empty_kernel1.KernelFeatures = constant [8 x i64] [i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1]
+; CHECK-DAG: @empty_kernel2.KernelFeatures = constant [8 x i64] [i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1]
+; CHECK-DAG: @matmul_kernel.KernelFeatures = constant [8 x i64] [i64 1, i64 0, i64 1, i64 0, i64 0, i64 1, i64 0, i64 1]
+; CHECK-DAG: @multiply_kernel.KernelFeatures = constant [8 x i64] [i64 13, i64 6, i64 0, i64 21, i64 3, i64 11, i64 1, i64 2]
+
+define void @empty_kernel1() {
+  ret void
+}
+
+define void @empty_kernel2() {
+  ret void
+}
+
+define void @non_kernel() {
+  ret void
+}
+
+
+define i32 @matmul_kernel() {
+entry:
+  %retval = alloca i32, align 4
+  %mat1 = alloca [2 x [2 x i32]], align 16
+  %mat2 = alloca [2 x [2 x i32]], align 16
+  %res = alloca [2 x [2 x i32]], align 16
+  %i = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  %arraydecay = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %mat1, i64 0, i64 0
+  %arraydecay1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %mat2, i64 0, i64 0
+  %arraydecay2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %res, i64 0, i64 0
+  call void @multiply_kernel([2 x i32]* %arraydecay, [2 x i32]* %arraydecay1, [2 x i32]* %arraydecay2)
+  ret i32 0
+}
+
+define void @multiply_kernel([2 x i32]* %mat1, [2 x i32]* %mat2, [2 x i32]* %res) {
+entry:
+  %mat1.addr = alloca [2 x i32]*, align 8
+  %mat2.addr = alloca [2 x i32]*, align 8
+  %res.addr = alloca [2 x i32]*, align 8
+  %i = alloca i32, align 4
+  %j = alloca i32, align 4
+  %k = alloca i32, align 4
+  store [2 x i32]* %mat1, [2 x i32]** %mat1.addr, align 8
+  store [2 x i32]* %mat2, [2 x i32]** %mat2.addr, align 8
+  store [2 x i32]* %res, [2 x i32]** %res.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc24, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 2
+  br i1 %cmp, label %for.body, label %for.end26
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, i32* %j, align 4
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc21, %for.body
+  %1 = load i32, i32* %j, align 4
+  %cmp2 = icmp slt i32 %1, 2
+  br i1 %cmp2, label %for.body3, label %for.end23
+
+for.body3:                                        ; preds = %for.cond1
+  %2 = load [2 x i32]*, [2 x i32]** %res.addr, align 8
+  %3 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %3 to i64
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %2, i64 %idxprom
+  %4 = load i32, i32* %j, align 4
+  %idxprom4 = sext i32 %4 to i64
+  %arrayidx5 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx, i64 0, i64 %idxprom4
+  store i32 0, i32* %arrayidx5, align 4
+  store i32 0, i32* %k, align 4
+  br label %for.cond6
+
+for.cond6:                                        ; preds = %for.inc, %for.body3
+  %5 = load i32, i32* %k, align 4
+  %cmp7 = icmp slt i32 %5, 2
+  br i1 %cmp7, label %for.body8, label %for.end
+
+for.body8:                                        ; preds = %for.cond6
+  %6 = load [2 x i32]*, [2 x i32]** %mat1.addr, align 8
+  %7 = load i32, i32* %i, align 4
+  %idxprom9 = sext i32 %7 to i64
+  %arrayidx10 = getelementptr inbounds [2 x i32], [2 x i32]* %6, i64 %idxprom9
+  %8 = load i32, i32* %k, align 4
+  %idxprom11 = sext i32 %8 to i64
+  %arrayidx12 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx10, i64 0, i64 %idxprom11
+  %9 = load i32, i32* %arrayidx12, align 4
+  %10 = load [2 x i32]*, [2 x i32]** %mat2.addr, align 8
+  %11 = load i32, i32* %k, align 4
+  %idxprom13 = sext i32 %11 to i64
+  %arrayidx14 = getelementptr inbounds [2 x i32], [2 x i32]* %10, i64 %idxprom13
+  %12 = load i32, i32* %j, align 4
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx14, i64 0, i64 %idxprom15
+  %13 = load i32, i32* %arrayidx16, align 4
+  %mul = mul nsw i32 %9, %13
+  %14 = load [2 x i32]*, [2 x i32]** %res.addr, align 8
+  %15 = load i32, i32* %i, align 4
+  %idxprom17 = sext i32 %15 to i64
+  %arrayidx18 = getelementptr inbounds [2 x i32], [2 x i32]* %14, i64 %idxprom17
+  %16 = load i32, i32* %j, align 4
+  %idxprom19 = sext i32 %16 to i64
+  %arrayidx20 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx18, i64 0, i64 %idxprom19
+  %17 = load i32, i32* %arrayidx20, align 4
+  %add = add nsw i32 %17, %mul
+  store i32 %add, i32* %arrayidx20, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body8
+  %18 = load i32, i32* %k, align 4
+  %inc = add nsw i32 %18, 1
+  store i32 %inc, i32* %k, align 4
+  br label %for.cond6
+
+for.end:                                          ; preds = %for.cond6
+  br label %for.inc21
+
+for.inc21:                                        ; preds = %for.end
+  %19 = load i32, i32* %j, align 4
+  %inc22 = add nsw i32 %19, 1
+  store i32 %inc22, i32* %j, align 4
+  br label %for.cond1
+
+for.end23:                                        ; preds = %for.cond1
+  br label %for.inc24
+
+for.inc24:                                        ; preds = %for.end23
+  %20 = load i32, i32* %i, align 4
+  %inc25 = add nsw i32 %20, 1
+  store i32 %inc25, i32* %i, align 4
+  br label %for.cond
+
+for.end26:                                        ; preds = %for.cond
+  ret void
+}
+
+; Needed to trigger the openmp-opt pass
+declare dso_local void @__kmpc_kernel_prepare_parallel(i8*)
+
+!nvvm.annotations = !{!2, !0, !1, !3, !1, !4, !5, !2}
+
+!0 = !{void ()* @empty_kernel1, !"kernel", i32 1}
+!1 = !{void ()* @non_kernel, !"non_kernel", i32 1}
+!2 = !{null, !"align", i32 1}
+!3 = !{void ()* @empty_kernel2, !"kernel", i32 1}
+!4 = !{i32 ()* @matmul_kernel, !"kernel", i32 1}
+!5 = !{void ([2 x i32]*, [2 x i32]*, [2 x i32]*)* @multiply_kernel, !"kernel", i32 1}