Index: lib/Analysis/ScopInfo.cpp
===================================================================
--- lib/Analysis/ScopInfo.cpp
+++ lib/Analysis/ScopInfo.cpp
@@ -168,6 +168,16 @@
         "Do not add parameter bounds and do no gist simplify sets accordingly"),
     cl::Hidden, cl::init(false), cl::cat(PollyCategory));
 
+static cl::opt<bool> PollyAllowDereferenceOfAllFunctionParams(
+    "polly-allow-dereference-of-all-function-parameters",
+    cl::desc(
+        "Treat all parameters to functions that are pointers as dereferenceable."
+        " This is useful for invariant load hoisting, since we can generate"
+        " fewer runtime checks. This is only valid if all pointers to functions"
+        " are always initialized, so that Polly can choose to hoist"
+        " their loads."),
+    cl::Hidden, cl::init(false), cl::cat(PollyCategory));
+
 static cl::opt<bool> PollyPreciseFoldAccesses(
     "polly-precise-fold-accesses",
     cl::desc("Fold memory accesses to model more possible delinearizations "
@@ -3827,11 +3837,23 @@
   return nullptr;
 }
 
+bool isAParameter(llvm::Value *maybeParam, const Function &F) {
+  for (const llvm::Argument &Arg : F.args())
+    if (&Arg == maybeParam)
+      return true;
+
+  return false;
+}
+
 bool Scop::canAlwaysBeHoisted(MemoryAccess *MA, bool StmtInvalidCtxIsEmpty,
                               bool MAInvalidCtxIsEmpty,
                               bool NonHoistableCtxIsEmpty) {
   LoadInst *LInst = cast<LoadInst>(MA->getAccessInstruction());
   const DataLayout &DL = LInst->getParent()->getModule()->getDataLayout();
+  if (PollyAllowDereferenceOfAllFunctionParams &&
+      isAParameter(LInst->getPointerOperand(), getFunction()))
+    return true;
+
   // TODO: We can provide more information for better but more expensive
   // results.
   if (!isDereferenceableAndAlignedPointer(LInst->getPointerOperand(),
Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -1380,8 +1380,8 @@
 
 /// A list of functions that are available in NVIDIA's libdevice.
 const std::set<std::string> CUDALibDeviceFunctions = {
-    "exp", "expf", "expl", "cos", "cosf",
-    "sqrt", "sqrtf", "copysign", "copysignf", "copysignl"};
+    "exp", "expf", "expl", "cos", "cosf", "sqrt",
+    "sqrtf", "copysign", "copysignf", "copysignl", "log", "logf"};
 
 /// Return the corresponding CUDA libdevice function name for @p F.
 ///
@@ -1406,7 +1406,7 @@
 
   return F->isIntrinsic() &&
          (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
-          Name.startswith("llvm.copysign"));
+          Name.startswith("llvm.copysign") || Name.startswith("llvm.powi"));
 }
 
 /// Do not take `Function` as a subtree value.
Index: lib/Transform/ScheduleOptimizer.cpp
===================================================================
--- lib/Transform/ScheduleOptimizer.cpp
+++ lib/Transform/ScheduleOptimizer.cpp
@@ -483,61 +483,6 @@
   return Node;
 }
 
-/// Get the position of a dimension with a non-zero coefficient.
-///
-/// Check that isl constraint @p Constraint has only one non-zero
-/// coefficient for dimensions that have type @p DimType. If this is true,
-/// return the position of the dimension corresponding to the non-zero
-/// coefficient and a negative value, otherwise.
-///
-/// @param Constraint The isl constraint to be checked.
-/// @param DimType    The type of the dimensions.
-/// @return           The position of the dimension in case the isl
-///                   constraint satisfies the requirements, a negative
-///                   value, otherwise.
-static int getMatMulConstraintDim(isl::constraint Constraint,
-                                  isl::dim DimType) {
-  int DimPos = -1;
-  auto LocalSpace = Constraint.get_local_space();
-  int LocalSpaceDimNum = LocalSpace.dim(DimType);
-  for (int i = 0; i < LocalSpaceDimNum; i++) {
-    auto Val = Constraint.get_coefficient_val(DimType, i);
-    if (Val.is_zero())
-      continue;
-    if (DimPos >= 0 || (DimType == isl::dim::out && !Val.is_one()) ||
-        (DimType == isl::dim::in && !Val.is_negone()))
-      return -1;
-    DimPos = i;
-  }
-  return DimPos;
-}
-
-/// Check the form of the isl constraint.
-///
-/// Check that the @p DimInPos input dimension of the isl constraint
-/// @p Constraint has a coefficient that is equal to negative one, the @p
-/// DimOutPos has a coefficient that is equal to one and others
-/// have coefficients equal to zero.
-///
-/// @param Constraint The isl constraint to be checked.
-/// @param DimInPos   The input dimension of the isl constraint.
-/// @param DimOutPos  The output dimension of the isl constraint.
-/// @return           isl_stat_ok in case the isl constraint satisfies
-///                   the requirements, isl_stat_error otherwise.
-static isl_stat isMatMulOperandConstraint(isl::constraint Constraint,
-                                          int &DimInPos, int &DimOutPos) {
-  auto Val = Constraint.get_constant_val();
-  if (!isl_constraint_is_equality(Constraint.get()) || !Val.is_zero())
-    return isl_stat_error;
-  DimInPos = getMatMulConstraintDim(Constraint, isl::dim::in);
-  if (DimInPos < 0)
-    return isl_stat_error;
-  DimOutPos = getMatMulConstraintDim(Constraint, isl::dim::out);
-  if (DimOutPos < 0)
-    return isl_stat_error;
-  return isl_stat_ok;
-}
-
 /// Permute the two dimensions of the isl map.
 ///
 /// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
@@ -585,30 +530,49 @@
 ///            second output dimension.
 /// @return    True in case @p AccMap has the expected form and false,
 ///            otherwise.
-static bool isMatMulOperandAcc(isl::map AccMap, int &FirstPos, int &SecondPos) {
-  int DimInPos[] = {FirstPos, SecondPos};
-  auto Lambda = [=, &DimInPos](isl::basic_map BasicMap) -> isl::stat {
-    auto Constraints = BasicMap.get_constraint_list();
-    if (isl_constraint_list_n_constraint(Constraints.get()) != 2)
-      return isl::stat::error;
-    for (int i = 0; i < 2; i++) {
-      auto Constraint =
-          isl::manage(isl_constraint_list_get_constraint(Constraints.get(), i));
-      int InPos, OutPos;
-      if (isMatMulOperandConstraint(Constraint, InPos, OutPos) ==
-              isl_stat_error ||
-          OutPos > 1 || (DimInPos[OutPos] >= 0 && DimInPos[OutPos] != InPos))
-        return isl::stat::error;
-      DimInPos[OutPos] = InPos;
-    }
-    return isl::stat::ok;
-  };
-  if (AccMap.foreach_basic_map(Lambda) != isl::stat::ok || DimInPos[0] < 0 ||
-      DimInPos[1] < 0)
+static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos,
+                               int &SecondPos) {
+
+  isl::space Space = AccMap.get_space();
+  isl::map Universe = isl::map::universe(Space);
+
+  if (Space.dim(isl::dim::out) != 2)
     return false;
-  FirstPos = DimInPos[0];
-  SecondPos = DimInPos[1];
-  return true;
+
+  // MatMul has the form:
+  //   for (i = 0; i < N; i++)
+  //     for (j = 0; j < M; j++)
+  //       for (k = 0; k < P; k++)
+  //         C[i, j] += A[i, k] * B[k, j]
+  //
+  // Permutation of three outer loops: 3! = 6 possibilities.
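+  //
+  // For example, for the A[i][k] operand the candidate map equates input
+  // dimension 0 (the i loop) with output dimension 0 and input dimension 2
+  // (the k loop) with output dimension 1, i.e. the ordered pair (0, 2).
+  // The arrays below enumerate all six ordered pairs of the three loop
+  // dimensions.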
+  int FirstDims[] = {0, 0, 1, 1, 2, 2};
+  int SecondDims[] = {1, 2, 2, 0, 0, 1};
+  for (int i = 0; i < 6; i += 1) {
+    auto PossibleMatMul =
+        Universe.equate(isl::dim::in, FirstDims[i], isl::dim::out, 0)
+            .equate(isl::dim::in, SecondDims[i], isl::dim::out, 1);
+
+    AccMap = AccMap.intersect_domain(Domain);
+    PossibleMatMul = PossibleMatMul.intersect_domain(Domain);
+
+    // If AccMap spans the entire domain (i.e., it is not a partial write),
+    // compute FirstPos and SecondPos.
+    // If AccMap != PossibleMatMul here (both maps have been intersected with
+    // the domain at this point), the write does not cover the whole domain;
+    // in other words, it is a partial write, and partial writes must be
+    // rejected.
+    if (AccMap.is_equal(PossibleMatMul)) {
+      if (FirstPos != -1 && FirstPos != FirstDims[i])
+        continue;
+      FirstPos = FirstDims[i];
+      if (SecondPos != -1 && SecondPos != SecondDims[i])
+        continue;
+      SecondPos = SecondDims[i];
+      return true;
+    }
+  }
+
+  return false;
 }
 
 /// Does the memory access represent a non-scalar operand of the matrix
@@ -627,18 +591,16 @@
   if (!MemAccess->isLatestArrayKind() || !MemAccess->isRead())
     return false;
   auto AccMap = MemAccess->getLatestAccessRelation();
-  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.j) && !MMI.ReadFromC &&
-      isl_map_n_basic_map(AccMap.get()) == 1) {
+  isl::set StmtDomain = MemAccess->getStatement()->getDomain();
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.j) && !MMI.ReadFromC) {
     MMI.ReadFromC = MemAccess;
     return true;
   }
-  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.k) && !MMI.A &&
-      isl_map_n_basic_map(AccMap.get()) == 1) {
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.k) && !MMI.A) {
     MMI.A = MemAccess;
     return true;
   }
-  if (isMatMulOperandAcc(AccMap, MMI.k, MMI.j) && !MMI.B &&
-      isl_map_n_basic_map(AccMap.get()) == 1) {
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.k, MMI.j) && !MMI.B) {
     MMI.B = MemAccess;
     return true;
   }
@@ -758,8 +720,7 @@
     if (!MemAccessPtr->isWrite())
       return false;
     auto AccMap = MemAccessPtr->getLatestAccessRelation();
-    if (isl_map_n_basic_map(AccMap.get()) != 1 ||
-        !isMatMulOperandAcc(AccMap, MMI.i, MMI.j))
+    if (!isMatMulOperandAcc(Stmt->getDomain(), AccMap, MMI.i, MMI.j))
       return false;
     MMI.WriteToC = MemAccessPtr;
     break;
Index: test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
===================================================================
--- test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
+++ test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
@@ -4,3 +4,6 @@
 define float @__nv_cosf(float %a) {
   ret float %a
 }
+define float @__nv_logf(float %a) {
+  ret float %a
+}
Index: test/GPGPU/intrinsic-copied-into-kernel.ll
===================================================================
--- test/GPGPU/intrinsic-copied-into-kernel.ll
+++ test/GPGPU/intrinsic-copied-into-kernel.ll
@@ -14,6 +14,7 @@
 ; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
 ; KERNEL-IR: declare float @llvm.sqrt.f32(float)
 ; KERNEL-IR: declare float @llvm.fabs.f32(float)
+; KERNEL-IR: declare float @llvm.powi.f32(float, i32)
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
@@ -26,7 +27,8 @@
 ;      float tmp1 = sqrt(tmp1);
 ;      float tmp2 = fabs(tmp2);
 ;      float tmp3 = copysignf(tmp1, tmp2);
-;      B[i] = tmp3;
+;      float tmp4 = powi(tmp3, 2);
+;      B[i] = tmp4;
 ;    }
 ; }
 
@@ -51,8 +53,9 @@
   %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
   %fabs = tail call float @llvm.fabs.f32(float %sqrt);
   %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
+  %powi = tail call float @llvm.powi.f32(float %copysign, i32 2);
   %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %copysign, float* %B.arr.i, align 4
+  store float %powi, float* %B.arr.i, align 4
 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %wide.trip.count = zext i32 %N to i64
@@ -70,6 +73,7 @@
 declare float @llvm.sqrt.f32(float) #0
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.copysign.f32(float, float) #0
+declare float @llvm.powi.f32(float, i32) #0
 
 attributes #0 = { nounwind readnone }
Index: test/GPGPU/libdevice-functions-copied-into-kernel.ll
===================================================================
--- test/GPGPU/libdevice-functions-copied-into-kernel.ll
+++ test/GPGPU/libdevice-functions-copied-into-kernel.ll
@@ -20,6 +20,7 @@
 ; Check that the intrinsic call is present in the kernel IR.
 ; KERNEL-IR: %p_expf = tail call float @__nv_expf(float %A.arr.i.val_p_scalar_)
 ; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf)
+; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf)
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
@@ -29,9 +30,10 @@
 ; void f(float *A, float *B, int N) {
 ;   for(int i = 0; i < N; i++) {
 ;     float tmp0 = A[i];
-;     float tmp1 = expf(tmp1);
-;     tmp1 = cosf(tmp1);
-;     B[i] = tmp1;
+;     float tmp1 = expf(tmp0);
+;     float tmp2 = cosf(tmp1);
+;     float tmp3 = logf(tmp2);
+;     B[i] = tmp3;
 ;   }
 ; }
 
@@ -55,8 +57,9 @@
   ; Call to intrinsics that should be part of the kernel.
   %expf = tail call float @expf(float %A.arr.i.val)
   %cosf = tail call float @cosf(float %expf)
+  %logf = tail call float @logf(float %cosf)
   %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %expf, float* %B.arr.i, align 4
+  store float %logf, float* %B.arr.i, align 4
 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %wide.trip.count = zext i32 %N to i64
@@ -73,6 +76,7 @@
 ; Function Attrs: nounwind readnone
 declare float @expf(float) #0
 declare float @cosf(float) #0
+declare float @logf(float) #0
 
 attributes #0 = { nounwind readnone }
Index: test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
@@ -0,0 +1,59 @@
+; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; void pattern_matching_based_opts_splitmap(double C[static const restrict 2][2], double A[static const restrict 2][784], double B[static const restrict 784][2]) {
+;   for (int i = 0; i < 2; i+=1)
+;     for (int j = 0; j < 2; j+=1)
+;       for (int k = 0; k < 784; k+=1)
+;         C[i][j] += A[i][k] * B[k][j];
+; }
+;
+; Check that the pattern matching detects the matrix multiplication pattern
+; when the AccMap cannot be reduced to a single disjunct.
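+;
+; For reference, the transformed JSCOP (imported below) splits the write to C
+; into two disjuncts:
+;   { Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] : i2 <= 784 - i0 - i1;
+;     Stmt_for_body6[1, 1, 783] -> MemRef_C[1, 1] }
+; Together the two disjuncts still cover the full iteration domain, so this is
+; a complete (non-partial) write and the pattern must still be recognized.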
+;
+; CHECK: The matrix multiplication pattern was detected
+;
+; ModuleID = 'pattern_matching_based_opts_splitmap.ll'
+;
+; Function Attrs: noinline nounwind uwtable
+define void @pattern_matching_based_opts_splitmap([2 x double]* noalias dereferenceable(32) %C, [784 x double]* noalias dereferenceable(12544) %A, [2 x double]* noalias dereferenceable(12544) %B) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc21
+  %i = phi i64 [ 0, %entry ], [ %add22, %for.inc21 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body, %for.inc18
+  %j = phi i64 [ 0, %for.body ], [ %add19, %for.inc18 ]
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body3, %for.body6
+  %k = phi i64 [ 0, %for.body3 ], [ %add17, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [784 x double], [784 x double]* %A, i64 %i, i64 %k
+  %tmp6 = load double, double* %arrayidx8, align 8
+  %arrayidx12 = getelementptr inbounds [2 x double], [2 x double]* %B, i64 %k, i64 %j
+  %tmp10 = load double, double* %arrayidx12, align 8
+  %mul = fmul double %tmp6, %tmp10
+  %arrayidx16 = getelementptr inbounds [2 x double], [2 x double]* %C, i64 %i, i64 %j
+  %tmp14 = load double, double* %arrayidx16, align 8
+  %add = fadd double %tmp14, %mul
+  store double %add, double* %arrayidx16, align 8
+  %add17 = add nsw i64 %k, 1
+  %cmp5 = icmp slt i64 %add17, 784
+  br i1 %cmp5, label %for.body6, label %for.inc18
+
+for.inc18:                                        ; preds = %for.body6
+  %add19 = add nsw i64 %j, 1
+  %cmp2 = icmp slt i64 %add19, 2
+  br i1 %cmp2, label %for.body3, label %for.inc21
+
+for.inc21:                                        ; preds = %for.inc18
+  %add22 = add nsw i64 %i, 1
+  %cmp = icmp slt i64 %add22, 2
+  br i1 %cmp, label %for.body, label %for.end23
+
+for.end23:                                        ; preds = %for.inc21
+  ret void
+}
+
Index: test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop
@@ -0,0 +1,46 @@
+{
+   "arrays" : [
+      {
+         "name" : "MemRef_A",
+         "sizes" : [ "*", "784" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_B",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_C",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      }
+   ],
+   "context" : "{ : }",
+   "name" : "%for.body---%for.end23",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0, i2] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_B[i2, i1] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1 and 0 <= i1 <= 1 and 0 <= i2 <= 783 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> [i0, i1, i2] }"
+      }
+   ]
+}
Index: test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop.transformed
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop.transformed
@@ -0,0 +1,46 @@
+{
+   "arrays" : [
+      {
+         "name" : "MemRef_A",
+         "sizes" : [ "*", "784" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_B",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_C",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      }
+   ],
"context" : "{ : }", + "name" : "%for.body---%for.end23", + "statements" : [ + { + "accesses" : [ + { + "kind" : "read", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0, i2] }" + }, + { + "kind" : "read", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_B[i2, i1] }" + }, + { + "kind" : "read", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }" + }, + { + "kind" : "write", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] : i2 <= 784 - i0 - i1; Stmt_for_body6[1, 1, 783] -> MemRef_C[1, 1] }" + } + ], + "domain" : "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1 and 0 <= i1 <= 1 and 0 <= i2 <= 783 }", + "name" : "Stmt_for_body6", + "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> [i0, i1, i2] }" + } + ] +} Index: test/ScopInfo/allow-all-parameters-dereferencable.ll =================================================================== --- /dev/null +++ test/ScopInfo/allow-all-parameters-dereferencable.ll @@ -0,0 +1,98 @@ +; RUN: opt %loadPolly -analyze -polly-invariant-load-hoisting \ +; RUN: -polly-allow-dereference-of-all-function-parameters \ +; RUN: -polly-scops < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-invariant-load-hoisting \ +; RUN: -polly-codegen < %s | FileCheck %s --check-prefix=CODE-RTC + + +; RUN: opt %loadPolly -S -polly-invariant-load-hoisting \ +; RUN: -polly-allow-dereference-of-all-function-parameters \ +; RUN: -polly-codegen < %s | FileCheck %s --check-prefix=CODE + +; SCOP: Function: hoge +; SCOP-NEXT: Region: %bb15---%bb37 +; SCOP-NEXT: Max Loop Depth: 2 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp, tmp17, tmp28] -> { Stmt_bb29[i0] -> MemRef_arg1[0] }; +; SCOP-NEXT: Execution Context: [tmp, tmp17, tmp28] -> { : } +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp, tmp17, tmp28] -> { Stmt_bb27[] -> MemRef_arg[0] }; +; SCOP-NEXT: Execution Context: [tmp, tmp17, tmp28] -> { : } +; SCOP-NEXT: } + +; Check that without the option `-pollt-allow-dereference-of-all-function-parameters` +; we do generate the runtime check. +; CODE-RTC: polly.preload.cond: ; preds = %polly.preload.begin +; CODE-RTC-NEXT: br i1 %{{[a-zA-Z0-9]*}}, label %polly.preload.exec, label %polly.preload.merge + +; Check that we don't generate a runtime check because we treat all +; parameters as dereferencable. 
+; CODE-NOT: polly.preload.cond:                               ; preds = %polly.preload.begin
+; CODE-NOT: br i1 %{{[a-zA-Z0-9]*}}, label %polly.preload.exec, label %polly.preload.merge
+
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@global = external global i32
+
+; Function Attrs: nounwind uwtable
+define void @hoge(i32* noalias %arg, i32* noalias %arg1, [0 x double]* noalias %arg2, float* %A) #0 {
+bb:
+  %tmp = load i32, i32* @global, align 4
+  %tmp3 = icmp sgt i32 %tmp, 1
+  br label %bb14
+
+bb14:                                             ; preds = %bb
+  br label %bb15
+
+bb15:                                             ; preds = %bb14
+  br i1 %tmp3, label %bb16, label %bb27
+
+bb16:                                             ; preds = %bb15
+  %tmp17 = load i32, i32* %arg1, align 4
+  br label %bb18
+
+bb18:                                             ; preds = %bb18, %bb16
+  %tmp19 = phi i32 [ %tmp25, %bb18 ], [ 1, %bb16 ]
+  %tmp20 = sext i32 %tmp19 to i64
+  %tmp21 = add nsw i64 %tmp20, -1
+  %tmp22 = getelementptr [0 x double], [0 x double]* %arg2, i64 0, i64 %tmp21
+  %tmp23 = bitcast double* %tmp22 to i64*
+  store i64 undef, i64* %tmp23, align 8
+  %tmp24 = icmp eq i32 %tmp19, %tmp17
+  %tmp25 = add i32 %tmp19, 1
+  br i1 %tmp24, label %bb26, label %bb18
+
+bb26:                                             ; preds = %bb18
+  br label %bb27
+
+bb27:                                             ; preds = %bb26, %bb15
+  %tmp28 = load i32, i32* %arg, align 4
+  store float 42.0, float* %A
+  br label %bb29
+
+bb29:                                             ; preds = %bb35, %bb27
+  %tmp30 = load i32, i32* %arg1, align 4
+  store float 42.0, float* %A
+  br label %bb31
+
+bb31:                                             ; preds = %bb31, %bb29
+  %tmp32 = phi i32 [ undef, %bb31 ], [ 1, %bb29 ]
+  store float 42.0, float* %A
+  %tmp33 = icmp eq i32 %tmp32, %tmp30
+  br i1 %tmp33, label %bb34, label %bb31
+
+bb34:                                             ; preds = %bb31
+  br label %bb35
+
+bb35:                                             ; preds = %bb34
+  %tmp36 = icmp eq i32 0, %tmp28
+  br i1 %tmp36, label %bb37, label %bb29
+
+bb37:                                             ; preds = %bb35
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
Index: tools/GPURuntime/GPUJIT.c
===================================================================
--- tools/GPURuntime/GPUJIT.c
+++ tools/GPURuntime/GPUJIT.c
@@ -941,6 +941,10 @@
 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
 static CuMemAllocFcnTy *CuMemAllocFcnPtr;
 
+typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t,
+                                                unsigned int);
+static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr;
+
 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
     CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
     unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
@@ -1081,6 +1085,9 @@
   CuMemAllocFcnPtr =
       (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
 
+  CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA(
+      HandleCuda, "cuMemAllocManaged");
+
   CuMemFreeFcnPtr =
       (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
 
@@ -1445,7 +1452,7 @@
   // If not, we pass it along to the underlying allocator.
   // This is a hack, and can be removed if the underlying issue is fixed.
   if (isManagedPtr(mem)) {
-    if (cudaFree(mem) != cudaSuccess) {
+    if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) {
       fprintf(stderr, "cudaFree failed.\n");
       exit(-1);
     }
@@ -1465,15 +1472,18 @@
     fprintf(stderr, "cudaMallocManaged called with size 0. "
" "Promoting to size 1"); size = max(size, 1); - polly_initContextCUDA(); - dump_function(); - void *a; - if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) { + PollyGPUContext *_ = polly_initContextCUDA(); + assert(_ && "polly_initContextCUDA failed"); + + void *newMemPtr; + const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size, + CU_MEM_ATTACH_GLOBAL); + if (Res != CUDA_SUCCESS) { fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size); exit(-1); } - addManagedPtr(a); - return a; + addManagedPtr(newMemPtr); + return newMemPtr; } static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {