Index: include/polly/ScopInfo.h =================================================================== --- include/polly/ScopInfo.h +++ include/polly/ScopInfo.h @@ -282,6 +282,9 @@ /// For indirect accesses return the origin SAI of the BP, else null. const ScopArrayInfo *getBasePtrOriginSAI() const { return BasePtrOriginSAI; } + /// Return whether the ScopArrayInfo models a Fortran array. + bool isFortranArray() const { return FAD != nullptr; } + /// The set of derived indirect SAIs for this origin SAI. const SmallSetVector &getDerivedSAIs() const { return DerivedSAIs; @@ -2731,6 +2734,16 @@ /// that has name @p Name. ScopArrayInfo *getArrayInfoByName(const std::string BaseName); + // Return whether this Scop contains a Fortran array. + bool hasFortranArrays() const { + for (auto &S : *this) { + for (auto MemAcc : S) { + if (MemAcc->getLatestScopArrayInfo()->isFortranArray()) + return true; + } + } + return false; + } /// Check whether @p Schedule contains extension nodes. /// /// @return true if @p Schedule contains extension nodes. Index: lib/Analysis/ScopInfo.cpp =================================================================== --- lib/Analysis/ScopInfo.cpp +++ lib/Analysis/ScopInfo.cpp @@ -4645,7 +4645,7 @@ static isl::multi_union_pw_aff mapToDimension(isl::union_set USet, int N) { assert(N >= 0); assert(USet); - assert(!USet.is_empty()); + assert(!bool(USet.is_empty())); auto Result = isl::union_pw_multi_aff::empty(USet.get_space()); Index: lib/CodeGen/IslNodeBuilder.cpp =================================================================== --- lib/CodeGen/IslNodeBuilder.cpp +++ lib/CodeGen/IslNodeBuilder.cpp @@ -1436,7 +1436,7 @@ // Materialize values for the parameters of the SCoP. materializeParameters(); - // materialize the outermost dimension parameters for a Fortran array. + // Materialize the outermost dimension parameters for a Fortran array. // NOTE: materializeParameters() does not work since it looks through // the SCEVs. We don't have a corresponding SCEV for the array size // parameter Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -127,8 +127,20 @@ isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); for (MemoryAccess *Acc : *Stmt) { - isl_map *AddrFunc = Acc->getAddressFunction(); - AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); + isl_map *AddrFunc = nullptr; + + if (Acc->isAffine()) { + AddrFunc = Acc->getAddressFunction(); + AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); + + } else { + errs() << "@@Access: "; + Acc->dump(); + llvm_unreachable("Cannot codegen for GPU backend with non-affine access"); + return nullptr; + } + + assert(AddrFunc && "expected AddrFunc to be initialized."); isl_id *RefId = Acc->getId(); isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA); @@ -138,7 +150,6 @@ Access = FunctionExpr(Access, RefId, UserExpr); RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); } - return RefToExpr; } @@ -2134,6 +2145,7 @@ return isl_set_universe(Array->getSpace()); } + isl_set *Extent = isl_set_universe(Array->getSpace()); isl_set *AccessSet = isl_union_set_extract_set(AccessUSet, Array->getSpace()); @@ -2143,24 +2155,34 @@ isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0)); - isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0); - isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0); - OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in, - isl_pw_aff_dim(Val, isl_dim_in)); - OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in, - isl_pw_aff_dim(Val, isl_dim_in)); - OuterMin = - isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, Array->getBasePtrId()); - OuterMax = - isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, Array->getBasePtrId()); + // In case the array is a Fortran array, we allow unbounded + // outermost dimensions, since we can load the outermost dimension + // information at runtime. + if (Array->isFortranArray() && + !isl_set_dim_is_bounded(AccessSet, isl_dim_set, 0)) { + isl_set_free(AccessSet); + isl_pw_aff_free(Val); + } else { - isl_set *Extent = isl_set_universe(Array->getSpace()); + isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0); + isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0); + OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in, + isl_pw_aff_dim(Val, isl_dim_in)); + OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in, + isl_pw_aff_dim(Val, isl_dim_in)); + OuterMin = + isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, Array->getBasePtrId()); + OuterMax = + isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, Array->getBasePtrId()); + + Extent = isl_set_intersect( + Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val))); + Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val)); + } - Extent = isl_set_intersect( - Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val))); - Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val)); + int StartLowerBoundDim = Array->isFortranArray() ? 0 : 1; - for (unsigned i = 1; i < NumDims; ++i) + for (unsigned i = StartLowerBoundDim; i < NumDims; ++i) Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0); for (unsigned i = 0; i < NumDims; ++i) { @@ -2424,7 +2446,8 @@ // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer // is mostly CPU specific. Instead, we use PPCG's GPU code generation // strategy directly from this pass. - gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { + gpu_gen *generateGPU(bool HasFortranArrays, ppcg_scop *PPCGScop, + gpu_prog *PPCGProg) { auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen); @@ -2450,7 +2473,9 @@ int has_permutable = has_any_permutable_node(Schedule); - if (!has_permutable || has_permutable < 0) { + // TODO: I've simply allowed this to test out the codegen, is this a bad + // idea? + if ((!has_permutable || has_permutable < 0) && !HasFortranArrays) { Schedule = isl_schedule_free(Schedule); } else { Schedule = map_to_device(PPCGGen, Schedule); @@ -2683,7 +2708,8 @@ auto PPCGScop = createPPCGScop(); auto PPCGProg = createPPCGProg(PPCGScop); - auto PPCGGen = generateGPU(PPCGScop, PPCGProg); + auto PPCGGen = + generateGPU(CurrentScop.hasFortranArrays(), PPCGScop, PPCGProg); if (PPCGGen->tree) generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); Index: test/GPGPU/fortran-copy-kernel-affine.ll =================================================================== --- /dev/null +++ test/GPGPU/fortran-copy-kernel-affine.ll @@ -0,0 +1,134 @@ +; RUN: opt -analyze %loadPolly -polly-allow-nonaffine -polly-ignore-aliasing -polly-scops -polly-detect-fortran-arrays \ +; RUN: < %s | FileCheck %s -check-prefix=SCOPS + +; RUN: opt %loadPolly -polly-allow-nonaffine -polly-ignore-aliasing -polly-detect-fortran-arrays -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | FileCheck -check-prefix=CODE %s + +; REQUIRES: pollyacc + +; Check that Fortran arrays are detected. +; SCOPS: ReadAccess := [Reduction Type: NONE] [Fortran array descriptor: xs] [Scalar: 0] +; SCOPS-NEXT: [tmp11, tmp7, p_2, tmp1, p_4, MemRef_tmp10_fortranarr_size, MemRef_tmp5_fortranarr_size] -> { Stmt_9[i0] -> MemRef_tmp10[1 + i0, -p_2] }; +; SCOPS-NEXT: MustWriteAccess := [Reduction Type: NONE] [Fortran array descriptor: ys] [Scalar: 0] +; SCOPS-NEXT: [tmp11, tmp7, p_2, tmp1, p_4, MemRef_tmp10_fortranarr_size, MemRef_tmp5_fortranarr_size] -> { Stmt_9[i0] -> MemRef_tmp5[1 + i0, -p_4] }; + +; Check that we generate CUDA calls +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: if (tmp11 >= 1) { +; CODE-NEXT: if (tmp7 >= 1 && MemRef_tmp10_fortranarr_size >= 2) +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_tmp10, MemRef_tmp10, (MemRef_tmp10_fortranarr_size >= tmp11 + 1 ? tmp11 + 1 : MemRef_tmp10_fortranarr_size) * (tmp7) * sizeof(i32), cudaMemcpyHostToDevice)); +; CODE-NEXT: if (tmp1 >= 1 && MemRef_tmp5_fortranarr_size >= 2) +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_tmp5, MemRef_tmp5, (MemRef_tmp5_fortranarr_size >= tmp11 + 1 ? tmp11 + 1 : MemRef_tmp5_fortranarr_size) * (tmp1) * sizeof(i32), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(tmp11 >= 1048546 ? 32768 : floord(tmp11 + 31, 32)); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_tmp10, dev_MemRef_tmp5, tmp11, tmp7, p_2, tmp1, p_4, MemRef_tmp10_fortranarr_size, MemRef_tmp5_fortranarr_size); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: if (tmp1 >= 1 && MemRef_tmp5_fortranarr_size >= 2) +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_tmp5, dev_MemRef_tmp5, (MemRef_tmp5_fortranarr_size >= tmp11 + 1 ? tmp11 + 1 : MemRef_tmp5_fortranarr_size) * (tmp1) * sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } +; This is not the exact code that is generated from dragonegg. A non-affine access that is +; created due to error checking is manually made linear. +; +; PROGRAM main +; INTEGER, DIMENSION(100) :: xs +; INTEGER, DIMENSION(100) :: ys +; +; DO i = 1, 100 +; xs (i) = i +; ys(i) = 0 +; END DO +; +; CALL copy(xs, ys, 10) +; +; PRINT *, ys +; CONTAINS +; SUBROUTINE copy(xs, ys, n) +; IMPLICIT NONE +; INTEGER, DIMENSION(:), INTENT(INOUT) :: xs, ys +; INTEGER, INTENT(IN) :: n +; INTEGER :: i +; +; DO i = 1, n +; ys(i) = xs(i) +; END DO +; +; END SUBROUTINE copy +; END PROGRAM + + +; ModuleID = 'test/GPGPU/fortran-copy-kernel-affine.ll' +source_filename = "test/GPGPU/fortran-copy-kernel-affine.ll" +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +module asm "\09.ident\09\22GCC: (GNU) 4.6.4 LLVM: 3.3.1\22" + +%"struct.array1_integer(kind=4)" = type { i8*, i64, i64, [1 x %struct.descriptor_dimension] } +%struct.descriptor_dimension = type { i64, i64, i64 } +%"struct.array1_integer(kind=4).0" = type { i8*, i64, i64, [1 x %struct.descriptor_dimension] } + +@.cst = private constant [12 x i8] c"program.f90\00", align 8 +@0 = internal constant i32 10 +@options.14.1603 = internal constant [8 x i32] [i32 68, i32 511, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1], align 32 + +define internal void @copy.1550(%"struct.array1_integer(kind=4)"* noalias %xs, %"struct.array1_integer(kind=4).0"* noalias %ys, i32* noalias %n) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %tmp = getelementptr inbounds %"struct.array1_integer(kind=4).0", %"struct.array1_integer(kind=4).0"* %ys, i64 0, i32 3, i64 0, i32 0 + %tmp1 = load i64, i64* %tmp, align 8 + %tmp2 = icmp eq i64 %tmp1, 0 + %tmp3 = select i1 %tmp2, i64 1, i64 %tmp1 + %tmp4 = bitcast %"struct.array1_integer(kind=4).0"* %ys to i32** + %tmp5 = load i32*, i32** %tmp4, align 8 + %tmp6 = getelementptr inbounds %"struct.array1_integer(kind=4)", %"struct.array1_integer(kind=4)"* %xs, i64 0, i32 3, i64 0, i32 0 + %tmp7 = load i64, i64* %tmp6, align 8 + %tmp8 = icmp eq i64 %tmp7, 0 + %. = select i1 %tmp8, i64 1, i64 %tmp7 + %tmp9 = bitcast %"struct.array1_integer(kind=4)"* %xs to i32** + %tmp10 = load i32*, i32** %tmp9, align 8 + %tmp11 = load i32, i32* %n, align 4 + %tmp12 = icmp sgt i32 %tmp11, 0 + br i1 %tmp12, label %"9.preheader", label %return + +"9.preheader": ; preds = %entry.split + br label %"9" + +"9": ; preds = %"9", %"9.preheader" + %tmp13 = phi i32 [ %tmp24, %"9" ], [ 1, %"9.preheader" ] + %tmp14 = sext i32 %tmp13 to i64 + ; replace %tmp3 with % tmp1 + ; %tmp15 = mul i64 %tmp3, %tmp14 + %tmp15 = mul i64 %tmp1, %tmp14 + + %tmp16 = sub i64 %tmp15, %tmp3 + %tmp17 = sext i32 %tmp13 to i64 + ; replace %. with %tmp7 + ; %tmp18 = mul i64 %., %tmp17 + %tmp18 = mul i64 %tmp7, %tmp17 + + %tmp19 = sub i64 %tmp18, %. + %tmp20 = getelementptr i32, i32* %tmp10, i64 %tmp19 + %tmp21 = load i32, i32* %tmp20, align 4 + %tmp22 = getelementptr i32, i32* %tmp5, i64 %tmp16 + store i32 %tmp21, i32* %tmp22, align 4 + %tmp23 = icmp eq i32 %tmp13, %tmp11 + %tmp24 = add i32 %tmp13, 1 + br i1 %tmp23, label %return.loopexit, label %"9" + +return.loopexit: ; preds = %"9" + br label %return + +return: ; preds = %return.loopexit, %entry.split + ret void +} + +declare void @_gfortran_set_args(i32, i8**) + +declare void @_gfortran_set_options(i32, i32*) Index: test/Isl/CodeGen/fortran_array_runtime_size_generation.ll =================================================================== --- test/Isl/CodeGen/fortran_array_runtime_size_generation.ll +++ test/Isl/CodeGen/fortran_array_runtime_size_generation.ll @@ -1,5 +1,12 @@ ; Check that the runtime size computation is generated for Fortran arrays. +; REQUIRES=pollyacc + +; PPCG code generation backend: +; RUN: opt %loadPolly -S -polly-detect-fortran-arrays \ +; RUN: -polly-target=gpu -polly-acc-mincompute=0 \ +; RUN: -polly-codegen-ppcg < %s | FileCheck %s + ; Regular code generation backend: ; RUN: opt %loadPolly -S -polly-detect-fortran-arrays \ ; RUN: -polly-codegen < %s | FileCheck %s