Index: clang/lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -2731,11 +2731,11 @@
       CGF.CGM.getOpenMPRuntime().emitForStaticInit(
           CGF, S.getBeginLoc(), S.getDirectiveKind(), ScheduleKind, StaticInit);
+      // IV = LB;
+      CGF.EmitIgnoredExpr(S.getInit());
       // UB = min(UB, GlobalUB);
       if (!StaticChunkedOne)
         CGF.EmitIgnoredExpr(S.getEnsureUpperBound());
-      // IV = LB;
-      CGF.EmitIgnoredExpr(S.getInit());
       // For unchunked static schedule generate:
       //
       // while (idx <= UB) {
Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -187,6 +187,11 @@
 __OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_flush, false, Void, IdentPtr)
 __OMP_RTL(__kmpc_global_thread_num, false, Int32, IdentPtr)
+__OMP_RTL(__kmpc_for_static_init_4, false, Void, IdentPtr,
+          Int32 /*gtid*/, Int32 /*sched_type*/,
+          Int32Ptr /*plastiter*/,
+          Int32Ptr /*plower*/, Int32Ptr /*pupper*/, Int32Ptr /*pstride*/,
+          Int32 /*incr*/, Int32 /*chunk*/)
 __OMP_RTL(__kmpc_fork_call, true, Void, IdentPtr, Int32, ParallelTaskPtr)
 __OMP_RTL(__kmpc_omp_taskwait, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_omp_taskyield, false, Int32, IdentPtr, Int32, Int32)
Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -43,11 +43,16 @@
           "Number of OpenMP runtime functions identified");
 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
           "Number of OpenMP runtime function uses identified");
+STATISTIC(NumOpenMPLoopsRangeAnnotated,
+          "Number of OpenMP worksharing loops annotated with !range");
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 static constexpr auto TAG = "[" DEBUG_TYPE "]";
 #endif
 
+static constexpr unsigned KMPC_FOR_STATIC_INIT_LB = 4;
+static constexpr unsigned KMPC_FOR_STATIC_INIT_UB = 5;
+
 namespace {
 
 struct OpenMPOpt {
@@ -88,7 +93,7 @@
   /// functions).
   size_t getNumArgs() const { return ArgumentTypes.size(); }
 
-  /// Run the callback \p CB on each use and forget the use if the result is
+  /// Run the callback \p CB on each use and forget the use if \p CB returns
   /// true. The callback will be fed the function in which the use was
   /// encountered as second argument.
   void foreachUse(function_ref<bool(Use &, Function &)> CB) {
@@ -114,6 +119,7 @@
 
     Changed |= deduplicateRuntimeCalls();
     Changed |= deleteParallelRegions();
+    Changed |= addRangeMetadataToOMPLoopBounds();
 
     return Changed;
   }
@@ -201,6 +207,170 @@
     return Changed;
   }
 
+  /// Set \p ImmediateStore to the closest valid store to \p V preceding
+  /// \p Call, and \p ImmediateLoad to the closest valid load from \p V
+  /// succeeding \p Call.
+  /// In this context, a load/store is valid if the value pointed to by \p V
+  /// is unmodified on the path between it and \p Call.
+  /// Loads/stores are searched for only in the basic block of \p Call. If no
+  /// such load/store is found, the corresponding output is set to nullptr.
+  static void getLoadStoreGuardingCall(CallInst *Call, Value *V,
+                                       StoreInst *&ImmediateStore,
+                                       LoadInst *&ImmediateLoad) {
+    ImmediateStore = nullptr;
+    ImmediateLoad = nullptr;
+
+    BasicBlock *BB = Call->getParent();
+
+    // Set ImmediateStore by scanning backwards from 'Call'.
+    BasicBlock::reverse_iterator RevIt = Call->getReverseIterator();
+    ++RevIt;
+    for (; RevIt != BB->rend(); ++RevIt) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(&*RevIt)) {
+        if (SI->getPointerOperand() == V) {
+          ImmediateStore = SI;
+          break;
+        }
+      }
+
+      // 'RevIt' uses 'V' between the candidate store and 'Call' => it might
+      // modify the value pointed to by 'V'. Bail.
+      if (llvm::any_of(RevIt->operands(),
+                       [V](const Use &U) { return U.get() == V; }))
+        break;
+    }
+
+    // Set ImmediateLoad by scanning forwards from 'Call'.
+    auto It = Call->getIterator();
+    ++It;
+    for (; It != BB->end(); ++It) {
+      if (LoadInst *LI = dyn_cast<LoadInst>(&*It)) {
+        if (LI->getPointerOperand() == V) {
+          ImmediateLoad = LI;
+          break;
+        }
+      }
+
+      // 'It' uses 'V' before a load from it => it might modify the value
+      // pointed to by 'V'. Bail.
+      if (llvm::any_of(It->operands(),
+                       [V](const Use &U) { return U.get() == V; }))
+        break;
+    }
+  }
+
+  /// Returns true if \p R is a stricter range than \p L.
+  static bool isStricterBound(const MDNode *L, const MDNode *R) {
+    if (!L)
+      return true;
+
+    assert(R && "'R' is not allowed to be nullptr.");
+
+    int64_t LLow =
+        mdconst::extract<ConstantInt>(L->getOperand(0))->getSExtValue();
+    int64_t RLow =
+        mdconst::extract<ConstantInt>(R->getOperand(0))->getSExtValue();
+
+    int64_t LHigh =
+        mdconst::extract<ConstantInt>(L->getOperand(1))->getSExtValue();
+    int64_t RHigh =
+        mdconst::extract<ConstantInt>(R->getOperand(1))->getSExtValue();
+
+    return LLow <= RLow && LHigh >= RHigh;
+  }
+
+  /// Add !range metadata to the loads of omp.lb/omp.ub that guard a
+  /// __kmpc_for_static_init_4 call. The range is conservative: it is set to
+  /// the iteration space of the underlying loop.
+  bool addRangeMetadataToOMPLoopBounds() {
+    bool Changed = false;
+    RuntimeFunctionInfo &RFI = RFIs[OMPRTL___kmpc_for_static_init_4];
+
+    if (!RFI.Declaration)
+      return false;
+
+    // Helper to set the range (if possible) on each use of 'RFI'.
+    auto SetRangeCB = [&](Use &U, Function &F) {
+      CallInst *ForStaticInitCall = getCallIfRegularCall(U);
+      if (!ForStaticInitCall)
+        return false;
+
+      Value *OMPLBVal =
+          ForStaticInitCall->getArgOperand(KMPC_FOR_STATIC_INIT_LB);
+      Value *OMPUBVal =
+          ForStaticInitCall->getArgOperand(KMPC_FOR_STATIC_INIT_UB);
+
+      // The current implementation only supports omp.lb/omp.ub values that
+      // are allocas; bail if that's not the case.
+      if (!isa<AllocaInst>(OMPLBVal) || !isa<AllocaInst>(OMPUBVal))
+        return false;
+
+      StoreInst *ImmediateStoreToLB = nullptr, *ImmediateStoreToUB = nullptr;
+      LoadInst *ImmediateLoadFromLB = nullptr, *ImmediateLoadFromUB = nullptr;
+
+      getLoadStoreGuardingCall(ForStaticInitCall, OMPLBVal,
+                               ImmediateStoreToLB, ImmediateLoadFromLB);
+      getLoadStoreGuardingCall(ForStaticInitCall, OMPUBVal,
+                               ImmediateStoreToUB, ImmediateLoadFromUB);
+
+      if (!ImmediateStoreToLB || !ImmediateStoreToUB)
+        return false;
+
+      ConstantInt *Low =
+          dyn_cast<ConstantInt>(ImmediateStoreToLB->getValueOperand());
+      ConstantInt *High =
+          dyn_cast<ConstantInt>(ImmediateStoreToUB->getValueOperand());
+
+      if (!Low || !High) {
+        LLVM_DEBUG(
+            dbgs()
+            << "[addRangeMetadataToOMPLoopBounds]: Unable to set the ranges"
+            << " as the iteration space of '" << *ForStaticInitCall
+            << "' could not be established as a compile-time constant.\n");
+        return false;
+      }
+
+      LLVMContext &Context = ForStaticInitCall->getParent()->getContext();
+
+      // !range metadata describes a half-open interval, so the upper bound
+      // is High + 1.
+      ConstantInt *NonInclusiveHigh =
+          ConstantInt::get(High->getType(), High->getSExtValue() + 1);
+
+      Metadata *LowAndHigh[] = {
+          ConstantAsMetadata::get(Low),
+          ConstantAsMetadata::get(NonInclusiveHigh),
+      };
+
+      MDNode *Bound = MDNode::get(Context, LowAndHigh);
+      bool AnnotatedALoad = false;
+
+      if (ImmediateLoadFromLB &&
+          isStricterBound(
+              ImmediateLoadFromLB->getMetadata(LLVMContext::MD_range),
+              Bound)) {
+        AnnotatedALoad = true;
+        ImmediateLoadFromLB->setMetadata(LLVMContext::MD_range, Bound);
+      }
+
+      if (ImmediateLoadFromUB &&
+          isStricterBound(
+              ImmediateLoadFromUB->getMetadata(LLVMContext::MD_range),
+              Bound)) {
+        AnnotatedALoad = true;
+        ImmediateLoadFromUB->setMetadata(LLVMContext::MD_range, Bound);
+      }
+
+      Changed |= AnnotatedALoad;
+      NumOpenMPLoopsRangeAnnotated += AnnotatedALoad;
+
+      // Return false so 'U' is not cleared from the 'UsesMap'.
+      return false;
+    };
+
+    RFI.foreachUse(SetRangeCB);
+    return Changed;
+  }
+
   static Value *combinedIdentStruct(Value *Ident0, Value *Ident1,
                                     bool GlobalOnly) {
     // TODO: Figure out how to actually combine multiple debug locations. For
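For context, the pattern matched above is the one Clang emits for a statically
scheduled worksharing loop whose bounds are compile-time constants. A minimal
sketch of source code that plausibly produces the IR in the first test below
(the exact pragma/schedule form is an assumption; the function and callee
names mirror the test):

  void loop_bounds_known_at_compile_time(int N) {
  #pragma omp for schedule(static)
    for (int i = 0; i < 196; ++i) // 196 iterations: lb = 0, ub = 195.
      bar();
  }

Clang stores 0 to %.omp.lb and 195 to %.omp.ub immediately before the
__kmpc_for_static_init_4 call, so the loads guarding the call can be
annotated with !range !{i32 0, i32 196}.
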
Index: llvm/test/Transforms/OpenMP/set_bound_ranges.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/OpenMP/set_bound_ranges.ll
@@ -0,0 +1,283 @@
+; RUN: opt -openmpopt -S < %s | FileCheck %s
+; RUN: opt -passes=openmpopt -S < %s | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+
+define dso_local void @loop_bounds_known_at_compile_time(i32 %N) {
+; CHECK: void @loop_bounds_known_at_compile_time(
+entry:
+  %N.addr = alloca i32, align 4
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %i = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %N, i32* %N.addr, align 4
+  store i32 0, i32* %.omp.lb, align 4
+  store i32 195, i32* %.omp.ub, align 4
+  store i32 1, i32* %.omp.stride, align 4
+  store i32 0, i32* %.omp.is_last, align 4
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  %1 = load i32, i32* %.omp.lb, align 4
+; CHECK: %1 = load i32, i32* %.omp.lb, align 4, !range ![[RANGE0:[0-9]+]]
+  store i32 %1, i32* %.omp.iv, align 4
+  %2 = load i32, i32* %.omp.ub, align 4
+; CHECK: %2 = load i32, i32* %.omp.ub, align 4, !range ![[RANGE0]]
+  %cmp = icmp sgt i32 %2, 195
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  %3 = load i32, i32* %.omp.ub, align 4
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 195, %cond.true ], [ %3, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %cond.end
+  %4 = load i32, i32* %.omp.iv, align 4
+  %5 = load i32, i32* %.omp.ub, align 4
+  %cmp1 = icmp sle i32 %4, %5
+  br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %6 = load i32, i32* %.omp.iv, align 4
+  %mul = mul nsw i32 %6, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %i, align 4
+  %call = call i32 (...) @bar()
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %7 = load i32, i32* %.omp.iv, align 4
+  %add2 = add nsw i32 %7, 1
+  store i32 %add2, i32* %.omp.iv, align 4
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond
+  br label %omp.loop.exit
+
+omp.loop.exit:                                    ; preds = %omp.inner.for.end
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0)
+  call void @__kmpc_barrier(%struct.ident_t* @2, i32 %0)
+  ret void
+}
+
+declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*)
+
+declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32)
+
+declare dso_local i32 @bar(...)
+
+declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32)
+
+declare dso_local void @__kmpc_barrier(%struct.ident_t*, i32)
+
+define dso_local void @test_parallel_loop_in_sequential_loop(i32 %N) {
+; CHECK: void @test_parallel_loop_in_sequential_loop(
+entry:
+  %N.addr = alloca i32, align 4
+  %j = alloca i32, align 4
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %i = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %N, i32* %N.addr, align 4
+  store i32 0, i32* %j, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %1 = load i32, i32* %j, align 4
+  %cmp = icmp slt i32 %1, 2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, i32* %.omp.lb, align 4
+  store i32 195, i32* %.omp.ub, align 4
+  store i32 1, i32* %.omp.stride, align 4
+  store i32 0, i32* %.omp.is_last, align 4
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  %2 = load i32, i32* %.omp.lb, align 4
+; CHECK: %2 = load i32, i32* %.omp.lb, align 4, !range ![[RANGE0]]
+  store i32 %2, i32* %.omp.iv, align 4
+  %3 = load i32, i32* %.omp.ub, align 4
+; CHECK: %3 = load i32, i32* %.omp.ub, align 4, !range ![[RANGE0]]
+  %cmp1 = icmp sgt i32 %3, 195
+  br i1 %cmp1, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %for.body
+  br label %cond.end
+
+cond.false:                                       ; preds = %for.body
+  %4 = load i32, i32* %.omp.ub, align 4
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 195, %cond.true ], [ %4, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %cond.end
+  %5 = load i32, i32* %.omp.iv, align 4
+  %6 = load i32, i32* %.omp.ub, align 4
+  %cmp2 = icmp sle i32 %5, %6
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %7 = load i32, i32* %.omp.iv, align 4
+  %mul = mul nsw i32 %7, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %i, align 4
+  %call = call i32 (...) @bar()
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %8 = load i32, i32* %.omp.iv, align 4
+  %add3 = add nsw i32 %8, 1
+  store i32 %add3, i32* %.omp.iv, align 4
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond
+  br label %omp.loop.exit
+
+omp.loop.exit:                                    ; preds = %omp.inner.for.end
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0)
+  call void @__kmpc_barrier(%struct.ident_t* @2, i32 %0)
+  br label %for.inc
+
+for.inc:                                          ; preds = %omp.loop.exit
+  %9 = load i32, i32* %j, align 4
+  %inc = add nsw i32 %9, 1
+  store i32 %inc, i32* %j, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define dso_local void @test_runtime_it_space(i32 %N) {
+; CHECK: void @test_runtime_it_space(
+entry:
+  %N.addr = alloca i32, align 4
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.capture_expr. = alloca i32, align 4
+  %.capture_expr.1 = alloca i32, align 4
+  %i = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %i4 = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %N, i32* %N.addr, align 4
+  %1 = load i32, i32* %N.addr, align 4
+  store i32 %1, i32* %.capture_expr., align 4
+  %2 = load i32, i32* %.capture_expr., align 4
+  %sub = sub nsw i32 %2, 0
+  %sub2 = sub nsw i32 %sub, 1
+  %add = add nsw i32 %sub2, 1
+  %div = sdiv i32 %add, 1
+  %sub3 = sub nsw i32 %div, 1
+  store i32 %sub3, i32* %.capture_expr.1, align 4
+  store i32 0, i32* %i, align 4
+  %3 = load i32, i32* %.capture_expr., align 4
+  %cmp = icmp slt i32 0, %3
+  br i1 %cmp, label %omp.precond.then, label %omp.precond.end
+
+omp.precond.then:                                 ; preds = %entry
+  store i32 0, i32* %.omp.lb, align 4
+  %4 = load i32, i32* %.capture_expr.1, align 4
+  store i32 %4, i32* %.omp.ub, align 4
+  store i32 1, i32* %.omp.stride, align 4
+  store i32 0, i32* %.omp.is_last, align 4
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  %5 = load i32, i32* %.omp.lb, align 4
+; CHECK: %5 = load i32, i32* %.omp.lb, align 4
+; CHECK-NOT: !range
+  store i32 %5, i32* %.omp.iv, align 4
+  %6 = load i32, i32* %.omp.ub, align 4
+; CHECK: %6 = load i32, i32* %.omp.ub, align 4
+; CHECK-NOT: !range
+  %7 = load i32, i32* %.capture_expr.1, align 4
+  %cmp5 = icmp sgt i32 %6, %7
+  br i1 %cmp5, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.precond.then
+  %8 = load i32, i32* %.capture_expr.1, align 4
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.precond.then
+  %9 = load i32, i32* %.omp.ub, align 4
+; CHECK: %9 = load i32, i32* %.omp.ub, align 4
+; CHECK-NOT: !range
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ %8, %cond.true ], [ %9, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %cond.end
+  %10 = load i32, i32* %.omp.iv, align 4
+  %11 = load i32, i32* %.omp.ub, align 4
+  %cmp6 = icmp sle i32 %10, %11
+  br i1 %cmp6, label %omp.inner.for.body, label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %12 = load i32, i32* %.omp.iv, align 4
+  %mul = mul nsw i32 %12, 1
+  %add7 = add nsw i32 0, %mul
+  store i32 %add7, i32* %i4, align 4
+  %call = call i32 (...) @baz()
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %13 = load i32, i32* %.omp.iv, align 4
+  %add8 = add nsw i32 %13, 1
+  store i32 %add8, i32* %.omp.iv, align 4
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond
+  br label %omp.loop.exit
+
+omp.loop.exit:                                    ; preds = %omp.inner.for.end
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0)
+  br label %omp.precond.end
+
+omp.precond.end:                                  ; preds = %omp.loop.exit, %entry
+  call void @__kmpc_barrier(%struct.ident_t* @2, i32 %0)
+  ret void
+; CHECK: ret void
+}
+
+declare dso_local i32 @baz(...)
+
+; CHECK: ![[RANGE0]] = !{i32 0, i32 196}
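
The third test exercises the conservative path: when the trip count is only
known at run time, the store to %.omp.ub is not a ConstantInt, so SetRangeCB
bails and no !range is attached. A minimal sketch of source code that
plausibly produces such IR (again, the pragma form is an assumption; the
names mirror the test):

  void test_runtime_it_space(int N) {
    // The upper bound depends on N, so the pass cannot prove a compile-time
    // iteration space and leaves the bound loads unannotated.
  #pragma omp for schedule(static)
    for (int i = 0; i < N; ++i)
      baz();
  }

Once the loads are annotated, downstream passes can exploit the metadata:
with !range !{i32 0, i32 196} on the load of %.omp.ub, the guard
"icmp sgt i32 %2, 195" that implements UB = min(UB, GlobalUB) is provably
false and the select/phi it feeds can be folded away.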