Index: lib/Analysis/ScopInfo.cpp
===================================================================
--- lib/Analysis/ScopInfo.cpp
+++ lib/Analysis/ScopInfo.cpp
@@ -168,6 +168,16 @@
         "Do not add parameter bounds and do no gist simplify sets accordingly"),
     cl::Hidden, cl::init(false), cl::cat(PollyCategory));
 
+static cl::opt<bool> PollyAllowDereferenceOfAllFunctionParams(
+    "polly-allow-dereference-of-all-function-parameters",
+    cl::desc(
+        "Treat all parameters to functions that are pointers as dereferenceable."
+        " This is useful for invariant load hoisting, since we can generate"
+        " fewer runtime checks. This is only valid if all pointers to functions"
+        " are always initialized, so that Polly can choose to hoist"
+        " their loads."),
+    cl::Hidden, cl::init(false), cl::cat(PollyCategory));
+
 static cl::opt<bool> PollyPreciseFoldAccesses(
     "polly-precise-fold-accesses",
     cl::desc("Fold memory accesses to model more possible delinearizations "
@@ -3827,11 +3837,23 @@
   return nullptr;
 }
 
+bool isAParameter(llvm::Value *maybeParam, const Function &F) {
+  for (const llvm::Argument &Arg : F.args())
+    if (&Arg == maybeParam)
+      return true;
+
+  return false;
+}
+
 bool Scop::canAlwaysBeHoisted(MemoryAccess *MA, bool StmtInvalidCtxIsEmpty,
                               bool MAInvalidCtxIsEmpty,
                               bool NonHoistableCtxIsEmpty) {
   LoadInst *LInst = cast<LoadInst>(MA->getAccessInstruction());
   const DataLayout &DL = LInst->getParent()->getModule()->getDataLayout();
+  if (PollyAllowDereferenceOfAllFunctionParams &&
+      isAParameter(LInst->getPointerOperand(), getFunction()))
+    return true;
+
   // TODO: We can provide more information for better but more expensive
   // results.
   if (!isDereferenceableAndAlignedPointer(LInst->getPointerOperand(),
Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -1380,8 +1380,8 @@
 
 /// A list of functions that are available in NVIDIA's libdevice.
 const std::set<std::string> CUDALibDeviceFunctions = {
-    "exp", "expf", "expl", "cos", "cosf",
-    "sqrt", "sqrtf", "copysign", "copysignf", "copysignl"};
+    "exp", "expf", "expl", "cos", "cosf", "sqrt",
+    "sqrtf", "copysign", "copysignf", "copysignl", "log", "logf"};
 
 /// Return the corresponding CUDA libdevice function name for @p F.
 ///
@@ -1406,7 +1406,7 @@
 
   return F->isIntrinsic() &&
          (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
-          Name.startswith("llvm.copysign"));
+          Name.startswith("llvm.copysign") || Name.startswith("llvm.powi"));
 }
 
 /// Do not take `Function` as a subtree value.
Index: lib/Transform/ScheduleOptimizer.cpp
===================================================================
--- lib/Transform/ScheduleOptimizer.cpp
+++ lib/Transform/ScheduleOptimizer.cpp
@@ -483,61 +483,6 @@
   return Node;
 }
 
-/// Get the position of a dimension with a non-zero coefficient.
-///
-/// Check that isl constraint @p Constraint has only one non-zero
-/// coefficient for dimensions that have type @p DimType. If this is true,
-/// return the position of the dimension corresponding to the non-zero
-/// coefficient and a negative value, otherwise.
-///
-/// @param Constraint The isl constraint to be checked.
-/// @param DimType    The type of the dimensions.
-/// @return           The position of the dimension in case the isl
-///                   constraint satisfies the requirements, a negative
-///                   value, otherwise.
-static int getMatMulConstraintDim(isl::constraint Constraint,
-                                  isl::dim DimType) {
-  int DimPos = -1;
-  auto LocalSpace = Constraint.get_local_space();
-  int LocalSpaceDimNum = LocalSpace.dim(DimType);
-  for (int i = 0; i < LocalSpaceDimNum; i++) {
-    auto Val = Constraint.get_coefficient_val(DimType, i);
-    if (Val.is_zero())
-      continue;
-    if (DimPos >= 0 || (DimType == isl::dim::out && !Val.is_one()) ||
-        (DimType == isl::dim::in && !Val.is_negone()))
-      return -1;
-    DimPos = i;
-  }
-  return DimPos;
-}
-
-/// Check the form of the isl constraint.
-///
-/// Check that the @p DimInPos input dimension of the isl constraint
-/// @p Constraint has a coefficient that is equal to negative one, the @p
-/// DimOutPos has a coefficient that is equal to one and others
-/// have coefficients equal to zero.
-///
-/// @param Constraint The isl constraint to be checked.
-/// @param DimInPos   The input dimension of the isl constraint.
-/// @param DimOutPos  The output dimension of the isl constraint.
-/// @return           isl_stat_ok in case the isl constraint satisfies
-///                   the requirements, isl_stat_error otherwise.
-static isl_stat isMatMulOperandConstraint(isl::constraint Constraint,
-                                          int &DimInPos, int &DimOutPos) {
-  auto Val = Constraint.get_constant_val();
-  if (!isl_constraint_is_equality(Constraint.get()) || !Val.is_zero())
-    return isl_stat_error;
-  DimInPos = getMatMulConstraintDim(Constraint, isl::dim::in);
-  if (DimInPos < 0)
-    return isl_stat_error;
-  DimOutPos = getMatMulConstraintDim(Constraint, isl::dim::out);
-  if (DimOutPos < 0)
-    return isl_stat_error;
-  return isl_stat_ok;
-}
-
 /// Permute the two dimensions of the isl map.
 ///
 /// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
@@ -585,30 +530,49 @@
 ///            second output dimension.
 /// @return    True in case @p AccMap has the expected form and false,
 ///            otherwise.
-static bool isMatMulOperandAcc(isl::map AccMap, int &FirstPos, int &SecondPos) {
-  int DimInPos[] = {FirstPos, SecondPos};
-  auto Lambda = [=, &DimInPos](isl::basic_map BasicMap) -> isl::stat {
-    auto Constraints = BasicMap.get_constraint_list();
-    if (isl_constraint_list_n_constraint(Constraints.get()) != 2)
-      return isl::stat::error;
-    for (int i = 0; i < 2; i++) {
-      auto Constraint =
-          isl::manage(isl_constraint_list_get_constraint(Constraints.get(), i));
-      int InPos, OutPos;
-      if (isMatMulOperandConstraint(Constraint, InPos, OutPos) ==
-              isl_stat_error ||
-          OutPos > 1 || (DimInPos[OutPos] >= 0 && DimInPos[OutPos] != InPos))
-        return isl::stat::error;
-      DimInPos[OutPos] = InPos;
-    }
-    return isl::stat::ok;
-  };
-  if (AccMap.foreach_basic_map(Lambda) != isl::stat::ok || DimInPos[0] < 0 ||
-      DimInPos[1] < 0)
+static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos,
+                               int &SecondPos) {
+
+  isl::space Space = AccMap.get_space();
+  isl::map Universe = isl::map::universe(Space);
+
+  if (Space.dim(isl::dim::out) != 2)
     return false;
-  FirstPos = DimInPos[0];
-  SecondPos = DimInPos[1];
-  return true;
+
+  // MatMul has the form:
+  //   for (i = 0; i < N; i++)
+  //     for (j = 0; j < M; j++)
+  //       for (k = 0; k < P; k++)
+  //         C[i, j] += A[i, k] * B[k, j]
+  //
+  // Permutation of three outer loops: 3! = 6 possibilities.
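+  //
+  // For example, for the A[i][k] operand the candidate map equates input
+  // dimension 0 (the i loop) with output dimension 0 and input dimension 2
+  // (the k loop) with output dimension 1, i.e. the ordered pair (0, 2).
+  // The arrays below enumerate all six ordered pairs of the three loop
+  // dimensions.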
+  int FirstDims[] = {0, 0, 1, 1, 2, 2};
+  int SecondDims[] = {1, 2, 2, 0, 0, 1};
+  for (int i = 0; i < 6; i += 1) {
+    auto PossibleMatMul =
+        Universe.equate(isl::dim::in, FirstDims[i], isl::dim::out, 0)
+            .equate(isl::dim::in, SecondDims[i], isl::dim::out, 1);
+
+    AccMap = AccMap.intersect_domain(Domain);
+    PossibleMatMul = PossibleMatMul.intersect_domain(Domain);
+
+    // If AccMap spans the entire domain (i.e., it is not a partial write),
+    // compute FirstPos and SecondPos.
+    // If AccMap != PossibleMatMul here (both maps have been intersected with
+    // the domain at this point), the write does not cover the whole domain;
+    // in other words, it is a partial write, and partial writes must be
+    // rejected.
+    if (AccMap.is_equal(PossibleMatMul)) {
+      if (FirstPos != -1 && FirstPos != FirstDims[i])
+        continue;
+      FirstPos = FirstDims[i];
+      if (SecondPos != -1 && SecondPos != SecondDims[i])
+        continue;
+      SecondPos = SecondDims[i];
+      return true;
+    }
+  }
+
+  return false;
 }
 
 /// Does the memory access represent a non-scalar operand of the matrix
@@ -627,18 +591,16 @@
   if (!MemAccess->isLatestArrayKind() || !MemAccess->isRead())
     return false;
   auto AccMap = MemAccess->getLatestAccessRelation();
-  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.j) && !MMI.ReadFromC &&
-      isl_map_n_basic_map(AccMap.get()) == 1) {
+  isl::set StmtDomain = MemAccess->getStatement()->getDomain();
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.j) && !MMI.ReadFromC) {
     MMI.ReadFromC = MemAccess;
     return true;
   }
-  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.k) && !MMI.A &&
-      isl_map_n_basic_map(AccMap.get()) == 1) {
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.k) && !MMI.A) {
     MMI.A = MemAccess;
     return true;
   }
-  if (isMatMulOperandAcc(AccMap, MMI.k, MMI.j) && !MMI.B &&
-      isl_map_n_basic_map(AccMap.get()) == 1) {
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.k, MMI.j) && !MMI.B) {
     MMI.B = MemAccess;
     return true;
   }
@@ -758,8 +720,7 @@
     if (!MemAccessPtr->isWrite())
       return false;
     auto AccMap = MemAccessPtr->getLatestAccessRelation();
-    if (isl_map_n_basic_map(AccMap.get()) != 1 ||
-        !isMatMulOperandAcc(AccMap, MMI.i, MMI.j))
+    if (!isMatMulOperandAcc(Stmt->getDomain(), AccMap, MMI.i, MMI.j))
       return false;
     MMI.WriteToC = MemAccessPtr;
     break;
Index: test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
===================================================================
--- test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
+++ test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
@@ -4,3 +4,6 @@
 define float @__nv_cosf(float %a) {
   ret float %a
 }
+define float @__nv_logf(float %a) {
+  ret float %a
+}
Index: test/GPGPU/intrinsic-copied-into-kernel.ll
===================================================================
--- test/GPGPU/intrinsic-copied-into-kernel.ll
+++ test/GPGPU/intrinsic-copied-into-kernel.ll
@@ -14,6 +14,7 @@
 ; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
 ; KERNEL-IR: declare float @llvm.sqrt.f32(float)
 ; KERNEL-IR: declare float @llvm.fabs.f32(float)
+; KERNEL-IR: declare float @llvm.powi.f32(float, i32)
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
@@ -26,7 +27,8 @@
 ;      float tmp1 = sqrt(tmp1);
 ;      float tmp2 = fabs(tmp2);
 ;      float tmp3 = copysignf(tmp1, tmp2);
-;      B[i] = tmp3;
+;      float tmp4 = powi(tmp3, 2);
+;      B[i] = tmp4;
 ;    }
 ; }
 
@@ -51,8 +53,9 @@
   %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
   %fabs = tail call float @llvm.fabs.f32(float %sqrt);
   %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
+  %powi = tail call float @llvm.powi.f32(float %copysign, i32 2);
   %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %copysign, float* %B.arr.i, align 4
+  store float %powi, float* %B.arr.i, align 4
 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %wide.trip.count = zext i32 %N to i64
@@ -70,6 +73,7 @@
 declare float @llvm.sqrt.f32(float) #0
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.copysign.f32(float, float) #0
+declare float @llvm.powi.f32(float, i32) #0
 
 attributes #0 = { nounwind readnone }
Index: test/GPGPU/libdevice-functions-copied-into-kernel.ll
===================================================================
--- test/GPGPU/libdevice-functions-copied-into-kernel.ll
+++ test/GPGPU/libdevice-functions-copied-into-kernel.ll
@@ -20,6 +20,7 @@
 ; Check that the intrinsic call is present in the kernel IR.
 ; KERNEL-IR: %p_expf = tail call float @__nv_expf(float %A.arr.i.val_p_scalar_)
 ; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf)
+; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf)
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
@@ -29,9 +30,10 @@
 ; void f(float *A, float *B, int N) {
 ;   for(int i = 0; i < N; i++) {
 ;     float tmp0 = A[i];
-;     float tmp1 = expf(tmp1);
-;     tmp1 = cosf(tmp1);
-;     B[i] = tmp1;
+;     float tmp1 = expf(tmp0);
+;     float tmp2 = cosf(tmp1);
+;     float tmp3 = logf(tmp2);
+;     B[i] = tmp3;
 ;   }
 ; }
 
@@ -55,8 +57,9 @@
   ; Call to intrinsics that should be part of the kernel.
   %expf = tail call float @expf(float %A.arr.i.val)
   %cosf = tail call float @cosf(float %expf)
+  %logf = tail call float @logf(float %cosf)
   %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %expf, float* %B.arr.i, align 4
+  store float %logf, float* %B.arr.i, align 4
 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %wide.trip.count = zext i32 %N to i64
@@ -73,6 +76,7 @@
 ; Function Attrs: nounwind readnone
 declare float @expf(float) #0
 declare float @cosf(float) #0
+declare float @logf(float) #0
 
 attributes #0 = { nounwind readnone }
Index: test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
@@ -0,0 +1,59 @@
+; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; void pattern_matching_based_opts_splitmap(double C[static const restrict 2][2], double A[static const restrict 2][784], double B[static const restrict 784][2]) {
+;   for (int i = 0; i < 2; i+=1)
+;     for (int j = 0; j < 2; j+=1)
+;       for (int k = 0; k < 784; k+=1)
+;         C[i][j] += A[i][k] * B[k][j];
+; }
+;
+; Check that the pattern matching detects the matrix multiplication pattern
+; when the AccMap cannot be reduced to a single disjunct.
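+;
+; For reference, the transformed JSCOP (imported below) splits the write to C
+; into two disjuncts:
+;   { Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] : i2 <= 784 - i0 - i1;
+;     Stmt_for_body6[1, 1, 783] -> MemRef_C[1, 1] }
+; Together the two disjuncts still cover the full iteration domain, so this is
+; a complete (non-partial) write and the pattern must still be recognized.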
+;
+; CHECK: The matrix multiplication pattern was detected
+;
+; ModuleID = 'pattern_matching_based_opts_splitmap.ll'
+;
+; Function Attrs: noinline nounwind uwtable
+define void @pattern_matching_based_opts_splitmap([2 x double]* noalias dereferenceable(32) %C, [784 x double]* noalias dereferenceable(12544) %A, [2 x double]* noalias dereferenceable(12544) %B) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc21
+  %i = phi i64 [ 0, %entry ], [ %add22, %for.inc21 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body, %for.inc18
+  %j = phi i64 [ 0, %for.body ], [ %add19, %for.inc18 ]
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body3, %for.body6
+  %k = phi i64 [ 0, %for.body3 ], [ %add17, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [784 x double], [784 x double]* %A, i64 %i, i64 %k
+  %tmp6 = load double, double* %arrayidx8, align 8
+  %arrayidx12 = getelementptr inbounds [2 x double], [2 x double]* %B, i64 %k, i64 %j
+  %tmp10 = load double, double* %arrayidx12, align 8
+  %mul = fmul double %tmp6, %tmp10
+  %arrayidx16 = getelementptr inbounds [2 x double], [2 x double]* %C, i64 %i, i64 %j
+  %tmp14 = load double, double* %arrayidx16, align 8
+  %add = fadd double %tmp14, %mul
+  store double %add, double* %arrayidx16, align 8
+  %add17 = add nsw i64 %k, 1
+  %cmp5 = icmp slt i64 %add17, 784
+  br i1 %cmp5, label %for.body6, label %for.inc18
+
+for.inc18:                                        ; preds = %for.body6
+  %add19 = add nsw i64 %j, 1
+  %cmp2 = icmp slt i64 %add19, 2
+  br i1 %cmp2, label %for.body3, label %for.inc21
+
+for.inc21:                                        ; preds = %for.inc18
+  %add22 = add nsw i64 %i, 1
+  %cmp = icmp slt i64 %add22, 2
+  br i1 %cmp, label %for.body, label %for.end23
+
+for.end23:                                        ; preds = %for.inc21
+  ret void
+}
+
Index: test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop
@@ -0,0 +1,46 @@
+{
+   "arrays" : [
+      {
+         "name" : "MemRef_A",
+         "sizes" : [ "*", "784" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_B",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_C",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      }
+   ],
+   "context" : "{ : }",
+   "name" : "%for.body---%for.end23",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0, i2] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_B[i2, i1] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1 and 0 <= i1 <= 1 and 0 <= i2 <= 783 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> [i0, i1, i2] }"
+      }
+   ]
+}
Index: test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop.transformed
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop.transformed
@@ -0,0 +1,46 @@
+{
+   "arrays" : [
+      {
+         "name" : "MemRef_A",
+         "sizes" : [ "*", "784" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_B",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_C",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      }
+   ],
"context" : "{ : }", + "name" : "%for.body---%for.end23", + "statements" : [ + { + "accesses" : [ + { + "kind" : "read", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0, i2] }" + }, + { + "kind" : "read", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_B[i2, i1] }" + }, + { + "kind" : "read", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }" + }, + { + "kind" : "write", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] : i2 <= 784 - i0 - i1; Stmt_for_body6[1, 1, 783] -> MemRef_C[1, 1] }" + } + ], + "domain" : "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1 and 0 <= i1 <= 1 and 0 <= i2 <= 783 }", + "name" : "Stmt_for_body6", + "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> [i0, i1, i2] }" + } + ] +} Index: test/ScopInfo/allow-all-parameters-dereferencable.ll =================================================================== --- /dev/null +++ test/ScopInfo/allow-all-parameters-dereferencable.ll @@ -0,0 +1,98 @@ +; RUN: opt %loadPolly -analyze -polly-invariant-load-hoisting \ +; RUN: -polly-allow-dereference-of-all-function-parameters \ +; RUN: -polly-scops < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-invariant-load-hoisting \ +; RUN: -polly-codegen < %s | FileCheck %s --check-prefix=CODE-RTC + + +; RUN: opt %loadPolly -S -polly-invariant-load-hoisting \ +; RUN: -polly-allow-dereference-of-all-function-parameters \ +; RUN: -polly-codegen < %s | FileCheck %s --check-prefix=CODE + +; SCOP: Function: hoge +; SCOP-NEXT: Region: %bb15---%bb37 +; SCOP-NEXT: Max Loop Depth: 2 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp, tmp17, tmp28] -> { Stmt_bb29[i0] -> MemRef_arg1[0] }; +; SCOP-NEXT: Execution Context: [tmp, tmp17, tmp28] -> { : } +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp, tmp17, tmp28] -> { Stmt_bb27[] -> MemRef_arg[0] }; +; SCOP-NEXT: Execution Context: [tmp, tmp17, tmp28] -> { : } +; SCOP-NEXT: } + +; Check that without the option `-pollt-allow-dereference-of-all-function-parameters` +; we do generate the runtime check. +; CODE-RTC: polly.preload.cond: ; preds = %polly.preload.begin +; CODE-RTC-NEXT: br i1 %{{[a-zA-Z0-9]*}}, label %polly.preload.exec, label %polly.preload.merge + +; Check that we don't generate a runtime check because we treat all +; parameters as dereferencable. 
+; CODE-NOT: polly.preload.cond:                               ; preds = %polly.preload.begin
+; CODE-NOT: br i1 %{{[a-zA-Z0-9]*}}, label %polly.preload.exec, label %polly.preload.merge
+
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@global = external global i32
+
+; Function Attrs: nounwind uwtable
+define void @hoge(i32* noalias %arg, i32* noalias %arg1, [0 x double]* noalias %arg2, float* %A) #0 {
+bb:
+  %tmp = load i32, i32* @global, align 4
+  %tmp3 = icmp sgt i32 %tmp, 1
+  br label %bb14
+
+bb14:                                             ; preds = %bb
+  br label %bb15
+
+bb15:                                             ; preds = %bb14
+  br i1 %tmp3, label %bb16, label %bb27
+
+bb16:                                             ; preds = %bb15
+  %tmp17 = load i32, i32* %arg1, align 4
+  br label %bb18
+
+bb18:                                             ; preds = %bb18, %bb16
+  %tmp19 = phi i32 [ %tmp25, %bb18 ], [ 1, %bb16 ]
+  %tmp20 = sext i32 %tmp19 to i64
+  %tmp21 = add nsw i64 %tmp20, -1
+  %tmp22 = getelementptr [0 x double], [0 x double]* %arg2, i64 0, i64 %tmp21
+  %tmp23 = bitcast double* %tmp22 to i64*
+  store i64 undef, i64* %tmp23, align 8
+  %tmp24 = icmp eq i32 %tmp19, %tmp17
+  %tmp25 = add i32 %tmp19, 1
+  br i1 %tmp24, label %bb26, label %bb18
+
+bb26:                                             ; preds = %bb18
+  br label %bb27
+
+bb27:                                             ; preds = %bb26, %bb15
+  %tmp28 = load i32, i32* %arg, align 4
+  store float 42.0, float* %A
+  br label %bb29
+
+bb29:                                             ; preds = %bb35, %bb27
+  %tmp30 = load i32, i32* %arg1, align 4
+  store float 42.0, float* %A
+  br label %bb31
+
+bb31:                                             ; preds = %bb31, %bb29
+  %tmp32 = phi i32 [ undef, %bb31 ], [ 1, %bb29 ]
+  store float 42.0, float* %A
+  %tmp33 = icmp eq i32 %tmp32, %tmp30
+  br i1 %tmp33, label %bb34, label %bb31
+
+bb34:                                             ; preds = %bb31
+  br label %bb35
+
+bb35:                                             ; preds = %bb34
+  %tmp36 = icmp eq i32 0, %tmp28
+  br i1 %tmp36, label %bb37, label %bb29
+
+bb37:                                             ; preds = %bb35
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
Index: tools/GPURuntime/GPUJIT.c
===================================================================
--- tools/GPURuntime/GPUJIT.c
+++ tools/GPURuntime/GPUJIT.c
@@ -941,6 +941,10 @@
 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
 static CuMemAllocFcnTy *CuMemAllocFcnPtr;
 
+typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t,
+                                                unsigned int);
+static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr;
+
 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
     CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
     unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
@@ -1081,6 +1085,9 @@
   CuMemAllocFcnPtr =
       (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
 
+  CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA(
+      HandleCuda, "cuMemAllocManaged");
+
   CuMemFreeFcnPtr =
       (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
 
@@ -1445,7 +1452,7 @@
   // If not, we pass it along to the underlying allocator.
   // This is a hack, and can be removed if the underlying issue is fixed.
   if (isManagedPtr(mem)) {
-    if (cudaFree(mem) != cudaSuccess) {
+    if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) {
       fprintf(stderr, "cudaFree failed.\n");
       exit(-1);
     }
@@ -1465,15 +1472,18 @@
     fprintf(stderr, "cudaMallocManaged called with size 0. "
" "Promoting to size 1"); size = max(size, 1); - polly_initContextCUDA(); - dump_function(); - void *a; - if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) { + PollyGPUContext *_ = polly_initContextCUDA(); + assert(_ && "polly_initContextCUDA failed"); + + void *newMemPtr; + const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size, + CU_MEM_ATTACH_GLOBAL); + if (Res != CUDA_SUCCESS) { fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size); exit(-1); } - addManagedPtr(a); - return a; + addManagedPtr(newMemPtr); + return newMemPtr; } static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {