Index: include/polly/TempScopInfo.h =================================================================== --- include/polly/TempScopInfo.h +++ include/polly/TempScopInfo.h @@ -54,12 +54,17 @@ public: SmallVector<const SCEV *, 4> Subscripts, Sizes; + /// @brief The lower/upper bound for non-affine accesses. + const SCEV *LB, *UB; + explicit IRAccess(TypeKind Type, Value *BaseAddress, const SCEV *Offset, unsigned elemBytes, bool Affine, SmallVector<const SCEV *, 4> Subscripts, - SmallVector<const SCEV *, 4> Sizes) + SmallVector<const SCEV *, 4> Sizes, + const SCEV *LB = nullptr, const SCEV *UB = nullptr) : BaseAddress(BaseAddress), Offset(Offset), ElemBytes(elemBytes), - Type(Type), IsAffine(Affine), Subscripts(Subscripts), Sizes(Sizes) {} + Type(Type), IsAffine(Affine), Subscripts(Subscripts), Sizes(Sizes), + LB(LB), UB(UB) {} enum TypeKind getType() const { return Type; } @@ -79,6 +84,12 @@ bool isScalar() const { return Subscripts.size() == 0; } + /// @brief Return the lower bound of this access (for non-affine ones). + const SCEV *getLB() const { return LB; } + + /// @brief Return the upper bound of this access (for non-affine ones). + const SCEV *getUB() const { return UB; } + void print(raw_ostream &OS) const; }; @@ -285,6 +296,15 @@ /// instruction. IRAccess buildIRAccess(Instruction *Inst, Loop *L, Region *R); + /// @brief Build an instance of IRAccess if the call can access the memory. + /// + /// @param CI The call instruction that might access the memory + /// @param L The parent loop of the instruction + /// @param R The region on which we are going to build a TempScop + /// @param Accs The set of accesses in which we accumulate IRAccesses + void buildIRCallAccess(CallInst *CI, Loop *L, Region *R, + AccFuncSetType &Accs); + /// @brief Analyze and extract the cross-BB scalar dependences (or, /// dataflow dependencies) of an instruction. 
/// Index: lib/Analysis/ScopDetection.cpp =================================================================== --- lib/Analysis/ScopDetection.cpp +++ lib/Analysis/ScopDetection.cpp @@ -300,7 +300,7 @@ } bool ScopDetection::isValidCallInst(CallInst &CI) { - if (CI.mayHaveSideEffects() || CI.doesNotReturn()) + if (CI.doesNotReturn()) return false; if (CI.doesNotAccessMemory()) @@ -359,6 +359,10 @@ case llvm::Intrinsic::donothing: case llvm::Intrinsic::assume: case llvm::Intrinsic::expect: + // Memory intrinsics can be modeled, thus are supported. + case llvm::Intrinsic::memmove: + case llvm::Intrinsic::memset: + case llvm::Intrinsic::memcpy: return true; default: // Other intrinsics are not supported. Index: lib/Analysis/ScopInfo.cpp =================================================================== --- lib/Analysis/ScopInfo.cpp +++ lib/Analysis/ScopInfo.cpp @@ -71,7 +71,9 @@ /// @param Stmt The location at which the scalar evolution expression /// is evaluated. /// @param Expr The expression that is translated. - static __isl_give isl_pw_aff *getPwAff(ScopStmt *Stmt, const SCEV *Expr); + /// @param NbLS The number of input dimensions if not the same as @p Stmt. 
+ static __isl_give isl_pw_aff *getPwAff(ScopStmt *Stmt, const SCEV *Expr, + int NbLS = -1); private: isl_ctx *Ctx; @@ -101,14 +103,17 @@ : Ctx(Stmt->getIslCtx()), NbLoopSpaces(Stmt->getNumIterators()), S(Stmt->getParent()) {} -__isl_give isl_pw_aff *SCEVAffinator::getPwAff(ScopStmt *Stmt, - const SCEV *Scev) { +__isl_give isl_pw_aff *SCEVAffinator::getPwAff(ScopStmt *Stmt, const SCEV *Scev, + int NbLS) { Scop *S = Stmt->getParent(); const Region *Reg = &S->getRegion(); S->addParams(getParamsInAffineExpr(Reg, Scev, *S->getSE())); SCEVAffinator Affinator(Stmt); + if (NbLS != -1) + Affinator.NbLoopSpaces = NbLS; + return Affinator.visit(Scev); } @@ -436,14 +441,48 @@ BaseName = getIslCompatibleName("MemRef_", getBaseAddr(), ""); isl_id *BaseAddrId = isl_id_alloc(Ctx, getBaseName().c_str(), nullptr); + // Non-affine accesses are either real memory accesses with access functions + // we cannot analyze or special instructions (e.g., memset intrinsics) which + // write and/or read memory. The former will be overapproximated (e.g., we + // model the access to the whole array) while the latter comes with bounds. + // Since the overapproximated accesses might not write all array cells they + // are marked as MAY_WRITE. if (!Access.isAffine()) { - // We overapproximate non-affine accesses with a possible access to the - // whole array. For read accesses it does not make a difference, if an - // access must or may happen. However, for write accesses it is important to - // differentiate between writes that must happen and writes that may happen. 
+ isl_set *LBSet, *UBSet, *Range; + isl_local_space *LocalSpace; + isl_space *Space; + isl_pw_aff *IV; + isl_aff *Zero; + AccessRelation = isl_map_from_basic_map(createBasicAccessMap(Statement)); - AccessRelation = - isl_map_set_tuple_id(AccessRelation, isl_dim_out, BaseAddrId); + AccessRelation = isl_map_set_tuple_id(AccessRelation, isl_dim_out, + isl_id_copy(BaseAddrId)); + + Range = isl_map_range(isl_map_copy(AccessRelation)); + Space = isl_set_get_space(Range); + LocalSpace = isl_local_space_from_space( + isl_space_reset_tuple_id(Space, isl_dim_set)); + Zero = isl_aff_zero_on_domain(LocalSpace); + IV = + isl_pw_aff_from_aff(isl_aff_set_coefficient_si(Zero, isl_dim_in, 0, 1)); + + if (const SCEV *LB = Access.getLB()) { + LBSet = isl_pw_aff_ge_set(isl_pw_aff_copy(IV), + SCEVAffinator::getPwAff(Statement, LB, 1)); + LBSet = isl_set_set_tuple_id(LBSet, isl_id_copy(BaseAddrId)); + AccessRelation = isl_map_intersect_range(AccessRelation, LBSet); + } + + if (const SCEV *UB = Access.getUB()) { + UBSet = isl_pw_aff_le_set(isl_pw_aff_copy(IV), + SCEVAffinator::getPwAff(Statement, UB, 1)); + UBSet = isl_set_set_tuple_id(UBSet, isl_id_copy(BaseAddrId)); + AccessRelation = isl_map_intersect_range(AccessRelation, UBSet); + } + + isl_id_free(BaseAddrId); + isl_set_free(Range); + isl_pw_aff_free(IV); return; } Index: lib/Analysis/TempScopInfo.cpp =================================================================== --- lib/Analysis/TempScopInfo.cpp +++ lib/Analysis/TempScopInfo.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -188,6 +189,77 @@ Subscripts, Sizes); } +void TempScopInfo::buildIRCallAccess(CallInst *CI, Loop *L, Region *R, + AccFuncSetType &Accs) { + // Trivial calls can be ignored. 
+ if (CI->doesNotAccessMemory()) + return; + + SmallVector<const SCEV *, 4> Subscripts, Sizes; + + const SCEVUnknown *WriteBasePtr = nullptr, *ReadBasePtr = nullptr; + const SCEV *Len = nullptr, *WriteLen = nullptr, *ReadLen = nullptr; + const SCEV *WriteSize = nullptr, *ReadSize = nullptr; + const SCEV *WriteAF = nullptr, *ReadAF = nullptr; + const SCEV *WriteUB = nullptr, *ReadUB = nullptr; + unsigned RSize, WSize; + + // Handle memory intrinsics with well defined read and write effects. + if (auto *IT = dyn_cast<IntrinsicInst>(CI)) { + switch (IT->getIntrinsicID()) { + case llvm::Intrinsic::memmove: + case llvm::Intrinsic::memcpy: + ReadAF = SE->getSCEVAtScope(IT->getOperand(1), L); + ReadBasePtr = dyn_cast<SCEVUnknown>(SE->getPointerBase(ReadAF)); + assert(ReadBasePtr && "Could not find base pointer"); + ReadAF = SE->getMinusSCEV(ReadAF, ReadBasePtr); + RSize = + TD->getTypeStoreSize(ReadBasePtr->getType()->getPointerElementType()); + ReadSize = SE->getConstant(ZeroOffset->getType(), RSize); + + // Fall through + case llvm::Intrinsic::memset: + WriteAF = SE->getSCEVAtScope(IT->getOperand(0), L); + WriteBasePtr = dyn_cast<SCEVUnknown>(SE->getPointerBase(WriteAF)); + assert(WriteBasePtr && "Could not find base pointer"); + WriteAF = SE->getMinusSCEV(WriteAF, WriteBasePtr); + + WSize = TD->getTypeStoreSize( + WriteBasePtr->getType()->getPointerElementType()); + WriteSize = SE->getConstant(ZeroOffset->getType(), WSize); + + Len = SE->getSCEVAtScope(IT->getOperand(2), L); + WriteLen = + SE->getAddExpr(Len, SE->getConstant(Len->getType(), WSize - 1)); + WriteLen = SE->getUDivExpr(WriteLen, WriteSize); + WriteUB = SE->getAddExpr(WriteLen, WriteAF); + + Accs.push_back(std::make_pair( + IRAccess(IRAccess::MUST_WRITE, WriteBasePtr->getValue(), WriteAF, + WSize, false, Subscripts, Sizes, WriteAF, WriteUB), + CI)); + + if (ReadBasePtr) { + ReadLen = + SE->getAddExpr(Len, SE->getConstant(Len->getType(), RSize - 1)); + ReadLen = SE->getUDivExpr(ReadLen, ReadSize); + ReadUB = SE->getAddExpr(ReadLen, ReadAF); + + 
Accs.push_back(std::make_pair( + IRAccess(IRAccess::READ, ReadBasePtr->getValue(), ReadAF, RSize, + false, Subscripts, Sizes, ReadAF, ReadUB), + CI)); + } + + return; + default: + break; + } + } + + llvm_unreachable("Unknown call instruction, cannot model the effect!"); +} + void TempScopInfo::buildAccessFunctions(Region &R, BasicBlock &BB) { AccFuncSetType Functions; Loop *L = LI->getLoopFor(&BB); @@ -197,6 +269,11 @@ if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) Functions.push_back(std::make_pair(buildIRAccess(Inst, L, &R), Inst)); + // Some call accesses and intrinsics might access the memory. If so we + // build overapproximated IRAccesses for them. + if (CallInst *CI = dyn_cast<CallInst>(Inst)) + buildIRCallAccess(CI, L, &R, Functions); + if (!isa<StoreInst>(Inst) && buildScalarDependences(Inst, &R)) { // If the Instruction is used outside the statement, we need to build the // write access. Index: test/ScopInfo/memcpy.ll =================================================================== --- /dev/null +++ test/ScopInfo/memcpy.ll @@ -0,0 +1,64 @@ +; RUN: opt %loadPolly -basicaa -polly-scops -analyze < %s | FileCheck %s +; +; CHECK: Stmt_for_body3 +; CHECK: Domain := +; CHECK: { Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 1023 and i1 >= 0 and i1 <= 1023 }; +; CHECK: Scattering := +; CHECK: { Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }; +; CHECK: MustWriteAccess := [Reduction Type: NONE] +; CHECK: { Stmt_for_body3[i0, i1] -> MemRef_A[o0] : o0 >= -16 and o0 <= -6 }; +; CHECK: ReadAccess := [Reduction Type: NONE] +; CHECK: { Stmt_for_body3[i0, i1] -> MemRef_B[o0] : o0 >= 64 and o0 <= 69 }; +; +; #include <string.h> +; +; void jd(int *restrict A, long *restrict B) { +; for (int i = 0; i < 1024; i++) +; for (int j = 0; j < 1024; j++) +; memcpy(A - 4, B + 8, 37); +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @jd(i32* noalias %A, i64* noalias %B) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc5, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc6, %for.inc5 
] + %exitcond1 = icmp ne i32 %i.0, 1024 + br i1 %exitcond1, label %for.body, label %for.end7 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %j.0, 1024 + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %add.ptr = getelementptr inbounds i32* %A, i64 -4 + %tmp = bitcast i32* %add.ptr to i8* + %add.ptr4 = getelementptr inbounds i64* %B, i64 8 + %tmp2 = bitcast i64* %add.ptr4 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* %tmp2, i64 37, i32 4, i1 false) + br label %for.inc + +for.inc: ; preds = %for.body3 + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc5 + +for.inc5: ; preds = %for.end + %inc6 = add nsw i32 %i.0, 1 + br label %for.cond + +for.end7: ; preds = %for.cond + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1 + Index: test/ScopInfo/memmove.ll =================================================================== --- /dev/null +++ test/ScopInfo/memmove.ll @@ -0,0 +1,64 @@ +; RUN: opt %loadPolly -basicaa -polly-scops -analyze < %s | FileCheck %s +; +; CHECK: Stmt_for_body3 +; CHECK: Domain := +; CHECK: { Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 1023 and i1 >= 0 and i1 <= 1023 }; +; CHECK: Scattering := +; CHECK: { Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }; +; CHECK: MustWriteAccess := [Reduction Type: NONE] +; CHECK: { Stmt_for_body3[i0, i1] -> MemRef_A[o0] : o0 >= -16 and o0 <= -8 }; +; CHECK: ReadAccess := [Reduction Type: NONE] +; CHECK: { Stmt_for_body3[i0, i1] -> MemRef_B[o0] : o0 >= 64 and o0 <= 68 }; +; +; #include <string.h> +; +; void jd(int *restrict A, long *restrict B) { +; for (int i = 0; i < 1024; i++) +; for (int j = 0; j < 1024; j++) +; memmove(A - 4, B + 8, 32); +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void 
@jd(i32* noalias %A, i64* noalias %B) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc5, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc6, %for.inc5 ] + %exitcond1 = icmp ne i32 %i.0, 1024 + br i1 %exitcond1, label %for.body, label %for.end7 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %j.0, 1024 + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %add.ptr = getelementptr inbounds i32* %A, i64 -4 + %tmp = bitcast i32* %add.ptr to i8* + %add.ptr4 = getelementptr inbounds i64* %B, i64 8 + %tmp2 = bitcast i64* %add.ptr4 to i8* + call void @llvm.memmove.p0i8.p0i8.i64(i8* %tmp, i8* %tmp2, i64 32, i32 4, i1 false) + br label %for.inc + +for.inc: ; preds = %for.body3 + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc5 + +for.inc5: ; preds = %for.end + %inc6 = add nsw i32 %i.0, 1 + br label %for.cond + +for.end7: ; preds = %for.cond + ret void +} + +declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1 + Index: test/ScopInfo/memset.ll =================================================================== --- /dev/null +++ test/ScopInfo/memset.ll @@ -0,0 +1,59 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; +; CHECK: Stmt_for_body3 +; CHECK: Domain := +; CHECK: { Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 1023 and i1 >= 0 and i1 <= 1023 }; +; CHECK: Scattering := +; CHECK: { Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }; +; CHECK: MustWriteAccess := [Reduction Type: NONE] +; CHECK: { Stmt_for_body3[i0, i1] -> MemRef_A[o0] : o0 >= 0 and o0 <= 47 }; +; +; #include <string.h> +; +; void jd(int *A) { +; for (int i = 0; i < 1024; i++) +; for (int j = 0; j < 1024; j++) +; memset(A, '$', 187); +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @jd(i32* 
noalias %A) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc4, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ] + %exitcond1 = icmp ne i32 %i.0, 1024 + br i1 %exitcond1, label %for.body, label %for.end6 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %j.0, 1024 + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %tmp = bitcast i32* %A to i8* + call void @llvm.memset.p0i8.i64(i8* %tmp, i8 36, i64 187, i32 4, i1 false) + br label %for.inc + +for.inc: ; preds = %for.body3 + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc4 + +for.inc4: ; preds = %for.end + %inc5 = add nsw i32 %i.0, 1 + br label %for.cond + +for.end6: ; preds = %for.cond + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 +