Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -187,6 +187,11 @@
 __OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_flush, false, Void, IdentPtr)
 __OMP_RTL(__kmpc_global_thread_num, false, Int32, IdentPtr)
+__OMP_RTL(__kmpc_for_static_init_4, false, Void, IdentPtr,
+          Int32 /*gtid*/, Int32 /*sched_type*/,
+          Int32Ptr /*plastiter*/,
+          Int32Ptr /*plower*/, Int32Ptr /*pupper*/, Int32Ptr /*pstride*/,
+          Int32 /*incr*/, Int32 /*chunk*/)
 __OMP_RTL(__kmpc_fork_call, true, Void, IdentPtr, Int32, ParallelTaskPtr)
 __OMP_RTL(__kmpc_omp_taskwait, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_omp_taskyield, false, Int32, IdentPtr, Int32, Int32)
Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -43,11 +43,16 @@
           "Number of OpenMP runtime functions identified");
 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
           "Number of OpenMP runtime function uses identified");
+STATISTIC(NumOpenMPLoopsRangeAnnotated,
+          "Number of OpenMP worksharing directives annotated with !range");
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 static constexpr auto TAG = "[" DEBUG_TYPE "]";
 #endif
 
+static constexpr unsigned KMPC_FOR_STATIC_INIT_LB = 4;
+static constexpr unsigned KMPC_FOR_STATIC_INIT_UB = 5;
+
 namespace {
 
 struct OpenMPOpt {
@@ -88,7 +93,7 @@
   /// functions).
   size_t getNumArgs() const { return ArgumentTypes.size(); }
 
-  /// Run the callback \p CB on each use and forget the use if the result is
+  /// Run the callback \p CB on each use and forget the use if CB returns
   /// true. The callback will be fed the function in which the use was
   /// encountered as second argument.
   void foreachUse(function_ref<bool(Use &, Function &)> CB) {
@@ -114,6 +119,7 @@
 
     Changed |= deduplicateRuntimeCalls();
     Changed |= deleteParallelRegions();
+    Changed |= addRangeMetadataToOMPLoopBounds();
 
     return Changed;
   }
@@ -201,6 +207,133 @@
     return Changed;
   }
 
+  /// Returns true if 'R' is a stricter range than 'L'.
+  static bool isStricterBound(const MDNode *L, const MDNode *R) {
+    if (!L)
+      return true;
+
+    assert(R && "'R' not allowed as nullptr.");
+
+    int64_t LLow =
+        mdconst::extract<ConstantInt>(L->getOperand(0))->getSExtValue();
+    int64_t RLow =
+        mdconst::extract<ConstantInt>(R->getOperand(0))->getSExtValue();
+
+    int64_t LHigh =
+        mdconst::extract<ConstantInt>(L->getOperand(1))->getSExtValue();
+    int64_t RHigh =
+        mdconst::extract<ConstantInt>(R->getOperand(1))->getSExtValue();
+
+    return (LLow <= RLow && LHigh >= RHigh);
+  }
+
+  /// Adds range metadata to loads and stores of the omp.(ub/lb).
+  /// A conservative range is set by setting the range equal to iteration
+  /// space of the underlying loop.
+  bool addRangeMetadataToOMPLoopBounds() {
+    bool Changed = false;
+    RuntimeFunctionInfo &RFI = RFIs[OMPRTL___kmpc_for_static_init_4];
+
+    if (!RFI.Declaration)
+      return false;
+
+    auto SetRangeCB = [&](Use &U, Function &F) {
+      CallInst *StaticForCall = getCallIfRegularCall(U);
+      if (!StaticForCall)
+        return false;
+
+      Value *OMPLBVal = StaticForCall->getArgOperand(KMPC_FOR_STATIC_INIT_LB);
+      Value *OMPUBVal = StaticForCall->getArgOperand(KMPC_FOR_STATIC_INIT_UB);
+
+      if (!isa<AllocaInst>(OMPLBVal) || !isa<AllocaInst>(OMPUBVal))
+        // Support a simple, but widely occurring case.
+        return false;
+
+      StoreInst *StoreToLB = nullptr, *StoreToUB = nullptr;
+      LoadInst *LoadFromLB = nullptr, *LoadFromUB = nullptr;
+
+      /// GetLoadStoreGuardingStaticFor: sets nearest store to 'BoundVal'
+      /// preceding 'StaticForCall' & nearest load to 'BoundVal' succeeding
+      /// StaticForCall'
+      auto GetLoadStoreGuardingStaticFor =
+          [StaticForCall](Value *BoundVal, LoadInst *&Load, StoreInst *&Store) {
+            bool FoundCall = false;
+            for (Value *V : BoundVal->users()) {
+              // FIXME:
+              // 1. Find the bounds that covers the iteration space from all
+              // possible stores to omp.ub/omp.lb that reach StaticForCall.
+              // 2. Find all the loads from omp.ub/omp.lb which are reached
+              // only by StaticForCall and set their ranges to the bound found
+              // in (1).
+              if (!FoundCall) {
+                if (V == StaticForCall)
+                  FoundCall = true;
+                else if (isa<LoadInst>(V))
+                  Load = dyn_cast<LoadInst>(V);
+              } else {
+                if (isa<StoreInst>(V)) {
+                  Store = dyn_cast<StoreInst>(V);
+                  break;
+                }
+              }
+            }
+          };
+
+      GetLoadStoreGuardingStaticFor(OMPLBVal, LoadFromLB, StoreToLB);
+      GetLoadStoreGuardingStaticFor(OMPUBVal, LoadFromUB, StoreToUB);
+
+      // Bail out if any guarding access was not found; the bounds may be
+      // set or read in a pattern we do not recognize yet.
+      if (!StoreToLB || !StoreToUB || !LoadFromLB || !LoadFromUB)
+        return false;
+
+      ConstantInt *Low = dyn_cast<ConstantInt>(StoreToLB->getValueOperand());
+      ConstantInt *HighMinusOne =
+          dyn_cast<ConstantInt>(StoreToUB->getValueOperand());
+
+      if (!Low || !HighMinusOne) {
+        LLVM_DEBUG(
+            dbgs()
+            << "[addRangeMetadataToOMPLoopBounds]: Unable to set the ranges"
+            << " as the iteration zone of '" << *StaticForCall
+            << "' isn't compile time constant.\n");
+        return false;
+      }
+
+      LLVMContext &Context = StaticForCall->getParent()->getContext();
+
+      ConstantInt *High = ConstantInt::get(HighMinusOne->getType(),
+                                           (HighMinusOne->getSExtValue()) + 1);
+
+      Metadata *LowAndHigh[] = {
+          ConstantAsMetadata::get(Low),
+          ConstantAsMetadata::get(High),
+      };
+
+      MDNode *Bound = MDNode::get(Context, LowAndHigh);
+
+      if (isStricterBound(LoadFromLB->getMetadata(LLVMContext::MD_range),
+                          Bound)) {
+        Changed = true;
+        LoadFromLB->setMetadata(LLVMContext::MD_range, Bound);
+      }
+
+      if (isStricterBound(LoadFromUB->getMetadata(LLVMContext::MD_range),
+                          Bound)) {
+        Changed = true;
+        LoadFromUB->setMetadata(LLVMContext::MD_range, Bound);
+      }
+
+      ++NumOpenMPLoopsRangeAnnotated;
+
+      // Don't clear 'Use U' in 'UsesMap'
+      return false;
+    };
+
+    RFI.foreachUse(SetRangeCB);
+    return Changed;
+  }
+
   static Value *combinedIdentStruct(Value *Ident0, Value *Ident1,
                                     bool GlobalOnly) {
     // TODO: Figure out how to actually combine multiple debug locations. For
Index: llvm/test/Transforms/OpenMP/set_bound_ranges.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/OpenMP/set_bound_ranges.ll
@@ -0,0 +1,192 @@
+; RUN: opt -openmpopt -S < %s | FileCheck %s
+; RUN: opt -passes=openmpopt -S < %s | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+
+define dso_local void @foo(i32 %N) {
+;CHECK: void @foo(
+entry:
+  %N.addr = alloca i32, align 4
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %i = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %N, i32* %N.addr, align 4
+  store i32 0, i32* %.omp.lb, align 4
+  store i32 195, i32* %.omp.ub, align 4
+  store i32 1, i32* %.omp.stride, align 4
+  store i32 0, i32* %.omp.is_last, align 4
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  %1 = load i32, i32* %.omp.ub, align 4
+; CHECK: %1 = load i32, i32* %.omp.ub, align 4, !range ![[RANGE0:[0-9]+]]
+  %cmp = icmp sgt i32 %1, 195
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  %2 = load i32, i32* %.omp.ub, align 4
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 195, %cond.true ], [ %2, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4
+  %3 = load i32, i32* %.omp.lb, align 4
+; CHECK: %3 = load i32, i32* %.omp.lb, align 4, !range ![[RANGE0]]
+  store i32 %3, i32* %.omp.iv, align 4
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %cond.end
+  %4 = load i32, i32* %.omp.iv, align 4
+  %5 = load i32, i32* %.omp.ub, align 4
+  %cmp1 = icmp sle i32 %4, %5
+  br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %6 = load i32, i32* %.omp.iv, align 4
+  %mul = mul nsw i32 %6, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %i, align 4
+  %call = call i32 (...) @bar()
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %7 = load i32, i32* %.omp.iv, align 4
+  %add2 = add nsw i32 %7, 1
+  store i32 %add2, i32* %.omp.iv, align 4
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond
+  br label %omp.loop.exit
+
+omp.loop.exit:                                    ; preds = %omp.inner.for.end
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0)
+  call void @__kmpc_barrier(%struct.ident_t* @2, i32 %0)
+  ret void
+}
+
+declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*)
+
+declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32)
+
+declare dso_local i32 @bar(...)
+
+declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32)
+
+declare dso_local void @__kmpc_barrier(%struct.ident_t*, i32)
+
+define dso_local void @test_runtime_it_space(i32 %N) {
+;CHECK: void @test_runtime_it_space(
+entry:
+  %N.addr = alloca i32, align 4
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.capture_expr. = alloca i32, align 4
+  %.capture_expr.1 = alloca i32, align 4
+  %i = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %i4 = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %N, i32* %N.addr, align 4
+  %1 = load i32, i32* %N.addr, align 4
+  store i32 %1, i32* %.capture_expr., align 4
+  %2 = load i32, i32* %.capture_expr., align 4
+  %sub = sub nsw i32 %2, 0
+  %sub2 = sub nsw i32 %sub, 1
+  %add = add nsw i32 %sub2, 1
+  %div = sdiv i32 %add, 1
+  %sub3 = sub nsw i32 %div, 1
+  store i32 %sub3, i32* %.capture_expr.1, align 4
+  store i32 0, i32* %i, align 4
+  %3 = load i32, i32* %.capture_expr., align 4
+  %cmp = icmp slt i32 0, %3
+  br i1 %cmp, label %omp.precond.then, label %omp.precond.end
+
+omp.precond.then:                                 ; preds = %entry
+  store i32 0, i32* %.omp.lb, align 4
+  %4 = load i32, i32* %.capture_expr.1, align 4
+  store i32 %4, i32* %.omp.ub, align 4
+  store i32 1, i32* %.omp.stride, align 4
+  store i32 0, i32* %.omp.is_last, align 4
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  %5 = load i32, i32* %.omp.ub, align 4
+;CHECK: %5 = load i32, i32* %.omp.ub, align 4
+;CHECK-NOT: !range
+  %6 = load i32, i32* %.capture_expr.1, align 4
+  %cmp5 = icmp sgt i32 %5, %6
+  br i1 %cmp5, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.precond.then
+  %7 = load i32, i32* %.capture_expr.1, align 4
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.precond.then
+  %8 = load i32, i32* %.omp.ub, align 4
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ %7, %cond.true ], [ %8, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4
+  %9 = load i32, i32* %.omp.lb, align 4
+;CHECK: %9 = load i32, i32* %.omp.lb, align 4
+;CHECK-NOT: !range
+  store i32 %9, i32* %.omp.iv, align 4
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %cond.end
+  %10 = load i32, i32* %.omp.iv, align 4
+  %11 = load i32, i32* %.omp.ub, align 4
+  %cmp6 = icmp sle i32 %10, %11
+  br i1 %cmp6, label %omp.inner.for.body, label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %12 = load i32, i32* %.omp.iv, align 4
+  %mul = mul nsw i32 %12, 1
+  %add7 = add nsw i32 0, %mul
+  store i32 %add7, i32* %i4, align 4
+  %call = call i32 (...) @baz()
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %13 = load i32, i32* %.omp.iv, align 4
+  %add8 = add nsw i32 %13, 1
+  store i32 %add8, i32* %.omp.iv, align 4
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond
+  br label %omp.loop.exit
+
+omp.loop.exit:                                    ; preds = %omp.inner.for.end
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0)
+  br label %omp.precond.end
+
+omp.precond.end:                                  ; preds = %omp.loop.exit, %entry
+  call void @__kmpc_barrier(%struct.ident_t* @2, i32 %0)
+  ret void
+;CHECK: ret void
+}
+
+declare dso_local i32 @baz(...)
+
+
+; CHECK: ![[RANGE0]] = !{i32 0, i32 196}