Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -1109,6 +1109,14 @@
   return isl_bool_true;
 }
 
+// Check that this is a Value that is legal to copy into a kernel.
+// Without this check, we could end up trying to offload pointers to
+// LLVM intrinsic functions.
+static bool IsValidSubtreeValue(Value *V) {
+  auto F = dyn_cast<Function>(V);
+  return !(F && F->isIntrinsic());
+}
+
 SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
   SetVector<Value *> SubtreeValues;
   SetVector<const SCEV *> SCEVs;
@@ -1146,7 +1154,8 @@
     isl_id_free(Id);
   }
 
-  return SubtreeValues;
+  auto It = make_filter_range(SubtreeValues, IsValidSubtreeValue);
+  return SetVector<Value *>(It.begin(), It.end());
 }
 
 void GPUNodeBuilder::clearDominators(Function *F) {
Index: test/GPGPU/intrinsic-not-copied-to-kernel.ll
===================================================================
--- /dev/null
+++ test/GPGPU/intrinsic-not-copied-to-kernel.ll
@@ -0,0 +1,67 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -disable-output < %s
+
+; REQUIRES: pollyacc
+
+; static const int N = 1000;
+; void f(float A[N][N], int n, float B[N][N]) {
+;     for(int i = 0; i < n; i++) {
+;         for(int j = 0; j < n; j++) {
+;             B[i][j] = pow(A[i][j], 3);
+;         }
+;
+;     }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+define void @f([1000 x float]* %A, i32 %n, [1000 x float]* %B) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %cmp3 = icmp sgt i32 %n, 0
+  br i1 %cmp3, label %for.cond1.preheader.lr.ph, label %for.end13
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry.split
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.lr.ph, %for.inc11
+  %indvars.iv5 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next6, %for.inc11 ]
+  %cmp21 = icmp sgt i32 %n, 0
+  br i1 %cmp21, label %for.body3.lr.ph, label %for.inc11
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3.lr.ph, %for.body3
+  %indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [1000 x float], [1000 x float]* %A, i64 %indvars.iv5, i64 %indvars.iv
+  %0 = load float, float* %arrayidx5, align 4
+  %conv = fpext float %0 to double
+  %1 = tail call double @llvm.pow.f64(double %conv, double 3.000000e+00)
+  %conv6 = fptrunc double %1 to float
+  %arrayidx10 = getelementptr inbounds [1000 x float], [1000 x float]* %B, i64 %indvars.iv5, i64 %indvars.iv
+  store float %conv6, float* %arrayidx10, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %wide.trip.count = zext i32 %n to i64
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body3, label %for.cond1.for.inc11_crit_edge
+
+for.cond1.for.inc11_crit_edge:                    ; preds = %for.body3
+  br label %for.inc11
+
+for.inc11:                                        ; preds = %for.cond1.for.inc11_crit_edge, %for.cond1.preheader
+  %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1
+  %wide.trip.count7 = zext i32 %n to i64
+  %exitcond8 = icmp ne i64 %indvars.iv.next6, %wide.trip.count7
+  br i1 %exitcond8, label %for.cond1.preheader, label %for.cond.for.end13_crit_edge
+
+for.cond.for.end13_crit_edge:                     ; preds = %for.inc11
+  br label %for.end13
+
+for.end13:                                        ; preds = %for.cond.for.end13_crit_edge, %entry.split
+  ret void
+}
+
+declare double @llvm.pow.f64(double, double)
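
For reference, here is a minimal standalone sketch of the filtering idiom the first hunk relies on. The names keepNonIntrinsics and dropIntrinsicValues are illustrative only and do not appear in the patch; the point is that llvm::make_filter_range yields a lazy, non-mutating view over the container, so a fresh SetVector has to be rebuilt from the filtered iterators.

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SetVector.h"
  #include "llvm/IR/Function.h"

  using namespace llvm;

  // Keep every value except pointers to LLVM intrinsic functions,
  // mirroring IsValidSubtreeValue in the hunk above.
  static bool keepNonIntrinsics(Value *V) {
    auto *F = dyn_cast<Function>(V);
    return !(F && F->isIntrinsic());
  }

  // make_filter_range does not modify its input; construct a new
  // SetVector from the filtered range to drop the rejected values.
  static SetVector<Value *> dropIntrinsicValues(const SetVector<Value *> &Values) {
    auto Filtered = make_filter_range(Values, keepNonIntrinsics);
    return SetVector<Value *>(Filtered.begin(), Filtered.end());
  }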