Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -1142,6 +1142,68 @@
     does not alias any other memory visible within a function and that a
     ``swifterror`` alloca passed as an argument does not escape.
 
+``convergent``
+    In some parallel execution models, there exist operations that are executed
+    simultaneously for multiple threads, and one or more arguments of the
+    operation must have the same value across all simultaneously executing
+    threads. Such arguments are called ``convergent``.
+
+    While the ``convergent`` attribute on functions can be thought of as
+    "whether and when each call site of this function is reached must be
+    uniform across a (target-dependent) set of threads", the ``convergent``
+    attribute on function arguments can be thought of as "the value of this
+    argument at each execution of each call site of this function must be
+    uniform across a (target-dependent) set of threads".
+
+    For every function F, transformations must ensure that every pair of
+    executions of F that is "compatible" with respect to convergent function
+    arguments in the original function F is also "compatible" when executing
+    the transformed function F. "Compatibility" is defined as follows:
+
+    Two executions e1 and e2 of a function are compatible (with respect to
+    convergent function arguments) if, for every call site CS of a function
+    with a convergent argument, the sequences e1(CS) and e2(CS) of values
+    supplied to the convergent argument at CS during e1 and e2, respectively,
+    are such that e1(CS) is a subsequence of e2(CS) or vice versa.
+
+    As an example to illustrate this definition, consider the following
+    stylized code fragment:
+
+    .. code-block:: text
+
+      if (cond) {
+        Tmp1 = Foo(v [convergent])
+      } else {
+        Tmp2 = Foo(v [convergent])
+      }
+      Tmp3 = phi [Tmp1, Tmp2]
+
+    Without the restrictions imposed by the convergent attribute, this could be
+    simplified to:
+
+    .. code-block:: text
+
+      Tmp3 = Foo(v [convergent])
+
+    To see why this transformation is forbidden, consider the executions
+    ``e1 = { cond = true, v = V0 }`` and ``e2 = { cond = false, v = V1 }`` with
+    V0 different from V1. These executions are compatible in the original code
+    fragment: at the first call site of ``Foo``, the sequences are
+    ``e1(CS) = (V0)`` and ``e2(CS) = ()``, so ``e2(CS)`` is a subsequence of
+    ``e1(CS)``. The second call site is similar. However, the executions are
+    incompatible in the transformed code fragment: the sequences at the single
+    call site are ``e1(CS) = (V0)`` and ``e2(CS) = (V1)``, so neither is a
+    subsequence of the other.
+
+    Calling a function F that contains a call site CS with convergent function
+    arguments has defined behavior only if the only data dependencies of the
+    convergent function arguments of CS are convergent parameters of F. If this
+    condition is not satisfied, then the behavior in F itself is defined, but a
+    function G that calls F has undefined behavior. The purpose of this rule is
+    to allow transformations to be applied to G without having to transitively
+    look inside all functions called by G.
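+
+    For illustration, the attribute is written directly on the parameter in
+    textual IR, just like other parameter attributes. The declaration below is
+    a hypothetical example (``@example.wait`` is not an intrinsic defined by
+    LLVM); it only shows the syntax and the resulting obligation on callers:
+
+    .. code-block:: llvm
+
+      ; At every execution of every call site of @example.wait, the value
+      ; passed for the first parameter must be uniform across the relevant
+      ; (target-dependent) set of threads.
+      declare void @example.wait(i32 convergent)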
+
+
 .. _gc:
 
 Garbage Collector Strategy Names
Index: include/llvm/Analysis/CodeMetrics.h
===================================================================
--- include/llvm/Analysis/CodeMetrics.h
+++ include/llvm/Analysis/CodeMetrics.h
@@ -53,7 +53,8 @@
   /// one or more 'noduplicate' instructions.
   bool notDuplicatable = false;
 
-  /// \brief True if this function contains a call to a convergent function.
+  /// \brief True if this function contains a call to a convergent function or
+  /// a function with convergent arguments.
   bool convergent = false;
 
   /// \brief True if this function calls alloca (in the C sense).
Index: include/llvm/IR/Instructions.h
===================================================================
--- include/llvm/IR/Instructions.h
+++ include/llvm/IR/Instructions.h
@@ -1775,6 +1775,18 @@
     return AttributeList.hasAttrSomewhere(Attribute::ByVal);
   }
 
+  /// Determine if any call argument is convergent.
+  bool hasConvergentArgument() const {
+    if (AttributeList.hasAttrSomewhere(Attribute::Convergent))
+      return true;
+
+    if (const Function *F = getCalledFunction())
+      if (F->getAttributes().hasAttrSomewhere(Attribute::Convergent))
+        return true;
+
+    return false;
+  }
+
   /// Return the function called, or null if this is an
   /// indirect function invocation.
   ///
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -49,6 +49,12 @@
 // Throws - This intrinsic can throw.
 def Throws : IntrinsicProperty;
 
+// Convergent - The specified argument is convergent. This corresponds to the
+// IR function argument attribute of the same name.
+class Convergent<int argNo> : IntrinsicProperty {
+  int ArgNo = argNo;
+}
+
 // NoCapture - The specified argument pointer is not captured by the intrinsic.
 class NoCapture<int argNo> : IntrinsicProperty {
   int ArgNo = argNo;
Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -240,7 +240,7 @@
    llvm_i1_ty,        // slc(imm)
    llvm_i1_ty,        // lwe(imm)
    llvm_i1_ty],       // da(imm)
-  [IntrReadMem]>;
+  [IntrReadMem, Convergent<1>]>;
 
 def int_amdgcn_image_load : AMDGPUImageLoad;
 def int_amdgcn_image_load_mip : AMDGPUImageLoad;
@@ -256,7 +256,7 @@
    llvm_i1_ty,        // slc(imm)
    llvm_i1_ty,        // lwe(imm)
    llvm_i1_ty],       // da(imm)
-  []>;
+  [Convergent<2>]>;
 
 def int_amdgcn_image_store : AMDGPUImageStore;
 def int_amdgcn_image_store_mip : AMDGPUImageStore;
@@ -272,7 +272,7 @@
    llvm_i1_ty,        // slc(imm)
    llvm_i1_ty,        // lwe(imm)
    llvm_i1_ty],       // da(imm)
-  [IntrReadMem]>;
+  [IntrReadMem, Convergent<1>, Convergent<2>]>;
 
 // Basic sample
 def int_amdgcn_image_sample : AMDGPUImageSample;
@@ -364,7 +364,7 @@
    llvm_i1_ty,        // r128(imm)
    llvm_i1_ty,        // da(imm)
    llvm_i1_ty],       // slc(imm)
-  []>;
+  [Convergent<2>]>;
 
 def int_amdgcn_image_atomic_swap : AMDGPUImageAtomic;
 def int_amdgcn_image_atomic_add : AMDGPUImageAtomic;
@@ -387,7 +387,7 @@
    llvm_i1_ty,        // r128(imm)
    llvm_i1_ty,        // da(imm)
    llvm_i1_ty],       // slc(imm)
-  []>;
+  [Convergent<3>]>;
 
 class AMDGPUBufferLoad : Intrinsic <
   [llvm_anyfloat_ty],
@@ -396,7 +396,7 @@
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
    llvm_i1_ty,        // glc(imm)
    llvm_i1_ty],       // slc(imm)
-  [IntrReadMem]>;
+  [IntrReadMem, Convergent<0>]>;
 
 def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
@@ -408,7 +408,7 @@
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
    llvm_i1_ty,        // glc(imm)
    llvm_i1_ty],       // slc(imm)
-  [IntrWriteMem]>;
+  [IntrWriteMem, Convergent<1>]>;
 
 def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
 def int_amdgcn_buffer_store : AMDGPUBufferStore;
@@ -419,7 +419,7 @@
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
    llvm_i1_ty],       // slc(imm)
-  []>;
+  [Convergent<1>]>;
 def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
 def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
 def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
@@ -438,7 +438,7 @@
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
    llvm_i1_ty],       // slc(imm)
-  []>;
+  [Convergent<2>]>;
 
 def int_amdgcn_buffer_wbinvl1_sc :
   GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
Index: lib/Analysis/CodeMetrics.cpp
===================================================================
--- lib/Analysis/CodeMetrics.cpp
+++ lib/Analysis/CodeMetrics.cpp
@@ -168,7 +168,7 @@
     if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
       if (CI->cannotDuplicate())
         notDuplicatable = true;
-      if (CI->isConvergent())
+      if (CI->isConvergent() || CI->hasConvergentArgument())
        convergent = true;
     }
 
Index: lib/AsmParser/LLParser.cpp
===================================================================
--- lib/AsmParser/LLParser.cpp
+++ lib/AsmParser/LLParser.cpp
@@ -1387,6 +1387,7 @@
       continue;
     }
     case lltok::kw_byval:           B.addAttribute(Attribute::ByVal); break;
+    case lltok::kw_convergent:      B.addAttribute(Attribute::Convergent); break;
     case lltok::kw_dereferenceable: {
       uint64_t Bytes;
       if (ParseOptionalDerefAttrBytes(lltok::kw_dereferenceable, Bytes))
Index: lib/IR/Verifier.cpp
===================================================================
--- lib/IR/Verifier.cpp
+++ lib/IR/Verifier.cpp
@@ -1324,7 +1324,6 @@
         I->getKindAsEnum() == Attribute::Cold ||
         I->getKindAsEnum() == Attribute::OptimizeNone ||
         I->getKindAsEnum() == Attribute::JumpTable ||
-        I->getKindAsEnum() == Attribute::Convergent ||
         I->getKindAsEnum() == Attribute::ArgMemOnly ||
         I->getKindAsEnum() == Attribute::NoRecurse ||
         I->getKindAsEnum() == Attribute::InaccessibleMemOnly ||
@@ -1335,7 +1334,8 @@
               "' only applies to functions!", V);
       return;
     }
-  } else if (I->getKindAsEnum() == Attribute::ReadOnly ||
+  } else if (I->getKindAsEnum() == Attribute::Convergent ||
+             I->getKindAsEnum() == Attribute::ReadOnly ||
              I->getKindAsEnum() == Attribute::WriteOnly ||
              I->getKindAsEnum() == Attribute::ReadNone) {
     if (Idx == 0) {
@@ -1362,6 +1362,7 @@
 
   if (isReturnValue)
     Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal) &&
+               !Attrs.hasAttribute(Idx, Attribute::Convergent) &&
               !Attrs.hasAttribute(Idx, Attribute::Nest) &&
               !Attrs.hasAttribute(Idx, Attribute::StructRet) &&
              !Attrs.hasAttribute(Idx, Attribute::NoCapture) &&
Index: lib/Transforms/Scalar/GVNHoist.cpp
===================================================================
--- lib/Transforms/Scalar/GVNHoist.cpp
+++ lib/Transforms/Scalar/GVNHoist.cpp
@@ -920,7 +920,7 @@
           break;
         }
 
-        if (Call->isConvergent())
+        if (Call->isConvergent() || Call->hasConvergentArgument())
          break;
 
        CI.insert(Call, VN);
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1007,6 +1007,13 @@
   // Assuming n is the same on all threads, any kind of unrolling is
   // safe. But currently llvm's notion of convergence isn't powerful
   // enough to express this.
+  //
+  // Similarly, if the loop contains an operation with a convergent argument,
+  // the prelude could shift the sequences of argument values in a way that
+  // affects the compatibility of runs wrt convergent function arguments.
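+  //
+  // As an illustrative sketch, assume an unroll factor of 4, a prelude that
+  // peels the first n % 4 iterations, and a convergent argument that is the
+  // canonical induction variable: runs with n = 5 and n = 6 supply the
+  // compatible sequences (0,1,2,3,4) and (0,1,2,3,4,5) at the single call
+  // site of the original loop, but at the first call site of the unrolled
+  // body they would supply (1) and (2), and neither is a subsequence of the
+  // other.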
+  //
+  // TODO: This is also conservative. If all convergent arguments that appear
+  // in the loop are loop invariant, then compatibility of runs is preserved.
   if (Convergent)
     UP.AllowRemainder = false;
 
Index: lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyCFG.cpp
+++ lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1371,7 +1371,6 @@
       return false;
 
     return true;
-
   case Instruction::ShuffleVector:
     // Shufflevector masks are constant.
     return OpIdx != 2;
@@ -1438,6 +1437,17 @@
     // Don't touch any operand of token type.
     return false;
 
+    // We cannot sink calls if any argument is convergent: sinking may
+    // introduce divergence even when the argument is the same IR value in all
+    // blocks, because divergence in that value may have been filtered out by
+    // the control-flow conditions guarding those blocks.
+    if (isa<CallInst>(I0) || isa<InvokeInst>(I0)) {
+      ImmutableCallSite CS(I0);
+      if (OI < CS.getNumArgOperands() &&
+          CS.paramHasAttr(OI + 1, Attribute::Convergent))
+        return false;
+    }
+
     // Because SROA can't handle speculating stores of selects, try not
     // to sink loads or stores of allocas when we'd have to create a PHI for
     // the address operand. Also, because it is likely that loads or stores
Index: test/Bitcode/attributes.ll
===================================================================
--- test/Bitcode/attributes.ll
+++ test/Bitcode/attributes.ll
@@ -334,6 +334,11 @@
   ret void
 }
 
+; CHECK: define void @f57(i32 convergent)
+define void @f57(i32 convergent) {
+  ret void
+}
+
 ; CHECK: attributes #0 = { noreturn }
 ; CHECK: attributes #1 = { nounwind }
 ; CHECK: attributes #2 = { readnone }
Index: test/Transforms/GVNHoist/hoist-convergent.ll
===================================================================
--- test/Transforms/GVNHoist/hoist-convergent.ll
+++ test/Transforms/GVNHoist/hoist-convergent.ll
@@ -86,8 +86,35 @@
   ret float %add
 }
 
+; Cannot hoist because there could be divergence in the argument that was
+; filtered out by the branch condition.
+;
+; CHECK-LABEL: @no_convergent_arg_hoisting(
+; CHECK: if.then:
+; CHECK: call float @convergent_arg_func(
+
+; CHECK: if.else:
+; CHECK: call float @convergent_arg_func(
+define float @no_convergent_arg_hoisting(i1 %cc, float %x) {
+entry:
+  br i1 %cc, label %if.then, label %if.else
+
+if.then:
+  %r1 = call float @convergent_arg_func(float %x)
+  br label %if.end
+
+if.else:
+  %r2 = call float @convergent_arg_func(float %x)
+  br label %if.end
+
+if.end:
+  %r = phi float [ %r1, %if.then ], [ %r2, %if.else ]
+  ret float %r
+}
+
 declare float @convergent_func(float, float) #0
 declare float @func(float, float) #1
+declare float @convergent_arg_func(float convergent) #1
 
 attributes #0 = { nounwind readnone convergent }
 attributes #1 = { nounwind readnone }
Index: test/Transforms/InstCombine/select-call.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/select-call.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare i32 @f(i32 convergent %a, i32 %b) nounwind readnone
+
+; Must keep both calls to f here, since they're not compatible wrt convergent
+; function arguments.
+define i32 @check_bad(i1 %cond, i32 %v) {
+; CHECK-LABEL: @check_bad(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[R0:%.*]] = call i32 @f(i32 0, i32 %v)
+; CHECK-NEXT:    [[R1:%.*]] = call i32 @f(i32 1, i32 %v)
+; CHECK-NEXT:    [[OUT:%.*]] = select i1 %cond, i32 [[R0]], i32 [[R1]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+entry:
+  %r0 = call i32 @f(i32 0, i32 %v)
+  %r1 = call i32 @f(i32 1, i32 %v)
+  %out = select i1 %cond, i32 %r0, i32 %r1
+  ret i32 %out
+}
+
+; TODO: In this case, select C, call, call -> call select would be beneficial
+; and correct.
+define i32 @check_good(i1 %cond, i32 %v) {
+; CHECK-LABEL: @check_good
+; NO-CHECK: %out.v = select i1 %cond, i32 0, i32 1
+; NO-CHECK: %out = call i32 @f(i32 %v, i32 %out.v)
+;
+entry:
+  %r0 = call i32 @f(i32 %v, i32 0)
+  %r1 = call i32 @f(i32 %v, i32 1)
+  %out = select i1 %cond, i32 %r0, i32 %r1
+  ret i32 %out
+}
+
Index: test/Transforms/LoopUnroll/convergent.ll
===================================================================
--- test/Transforms/LoopUnroll/convergent.ll
+++ test/Transforms/LoopUnroll/convergent.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -loop-unroll -unroll-runtime -unroll-allow-partial -S | FileCheck %s
 
 declare void @f() convergent
+declare void @g(i32 convergent)
 
 ; Although this loop contains a convergent instruction, it should be
 ; fully unrolled.
@@ -25,6 +26,27 @@
   ret i32 0
 }
 
+; CHECK-LABEL: @full_unroll_arg(
+define i32 @full_unroll_arg() {
+entry:
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK-NOT: call void @g(
+  call void @g(i32 %x.0)
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 3
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+
 ; This loop contains a convergent instruction, but it should be partially
 ; unrolled. The unroll count is the largest power of 2 that divides the
 ; multiple -- 4, in this case.
@@ -51,6 +73,28 @@
   ret i32 0
 }
 
+; CHECK-LABEL: @runtime_unroll_arg(
+define i32 @runtime_unroll_arg(i32 %n) {
+entry:
+  %loop_ctl = mul nsw i32 %n, 12
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK-NOT: call void @g(
+  call void @g(i32 %x.0)
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %loop_ctl
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
 ; This loop contains a convergent instruction, so its partial unroll
 ; count must divide its trip multiple. This overrides its unroll
 ; pragma -- we unroll exactly 8 times, even though 16 is requested.
@@ -80,4 +124,30 @@
   ret i32 0
 }
 
+; CHECK-LABEL: @pragma_unroll_arg
+define i32 @pragma_unroll_arg(i32 %n) {
+entry:
+  %loop_ctl = mul nsw i32 %n, 24
+  br label %l3, !llvm.loop !0
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK: call void @g(
+; CHECK-NOT: call void @g(
+  call void @g(i32 %x.0)
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %loop_ctl
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !0
+
+exit:
+  ret i32 0
+}
+
 !0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}}
Index: test/Transforms/SimplifyCFG/convergent.ll
===================================================================
--- /dev/null
+++ test/Transforms/SimplifyCFG/convergent.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+declare i32 @f(i32 convergent %a, i32 %b) nounwind readonly
+
+; Cannot sink because there would clearly be divergence in the first argument.
+;
+define i32 @check1(i1 %cond, i32 %a, i32 %b, i32 %v) {
+; CHECK-LABEL: @check1(
+; CHECK-NEXT:    br i1 %cond, label %if.then, label %if.else
+; CHECK:       if.then:
+; CHECK-NEXT:    [[RA:%.*]] = call i32 @f(i32 %a, i32 %v)
+; CHECK-NEXT:    br label %if.end
+; CHECK:       if.else:
+; CHECK-NEXT:    [[RB:%.*]] = call i32 @f(i32 %b, i32 %v)
+; CHECK-NEXT:    br label %if.end
+; CHECK:       if.end:
+; CHECK-NEXT:    [[OUT:%.*]] = phi i32 [ [[RA]], %if.then ], [ [[RB]], %if.else ]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %ra = call i32 @f(i32 %a, i32 %v)
+  br label %if.end
+
+if.else:
+  %rb = call i32 @f(i32 %b, i32 %v)
+  br label %if.end
+
+if.end:
+  %out = phi i32 [ %ra, %if.then ], [ %rb, %if.else ]
+  ret i32 %out
+}
+
+; Cannot sink because there could be divergence in the first argument that was
+; filtered out by the branch condition.
+;
+define i32 @check2(i1 %cond, i32 %a, i32 %b, i32 %v) {
+; CHECK-LABEL: @check2(
+; CHECK-NEXT:    br i1 %cond, label %if.then, label %if.else
+; CHECK:       if.then:
+; CHECK-NEXT:    [[RA:%.*]] = call i32 @f(i32 %v, i32 %a)
+; CHECK-NEXT:    br label %if.end
+; CHECK:       if.else:
+; CHECK-NEXT:    [[RB:%.*]] = call i32 @f(i32 %v, i32 %b)
+; CHECK-NEXT:    br label %if.end
+; CHECK:       if.end:
+; CHECK-NEXT:    [[OUT:%.*]] = phi i32 [ [[RA]], %if.then ], [ [[RB]], %if.else ]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %ra = call i32 @f(i32 %v, i32 %a)
+  br label %if.end
+
+if.else:
+  %rb = call i32 @f(i32 %v, i32 %b)
+  br label %if.end
+
+if.end:
+  %out = phi i32 [ %ra, %if.then ], [ %rb, %if.else ]
+  ret i32 %out
+}
Index: utils/TableGen/CodeGenIntrinsics.h
===================================================================
--- utils/TableGen/CodeGenIntrinsics.h
+++ utils/TableGen/CodeGenIntrinsics.h
@@ -108,7 +108,7 @@
   /// True if the intrinsic is marked as convergent.
   bool isConvergent;
 
-  enum ArgAttribute { NoCapture, Returned, ReadOnly, WriteOnly, ReadNone };
+  enum ArgAttribute { Convergent, NoCapture, Returned, ReadOnly, WriteOnly, ReadNone };
   std::vector<std::pair<unsigned, ArgAttribute>> ArgumentAttributes;
 
   CodeGenIntrinsic(Record *R);
Index: utils/TableGen/CodeGenTarget.cpp
===================================================================
--- utils/TableGen/CodeGenTarget.cpp
+++ utils/TableGen/CodeGenTarget.cpp
@@ -604,7 +604,10 @@
       isConvergent = true;
     else if (Property->getName() == "IntrNoReturn")
       isNoReturn = true;
-    else if (Property->isSubClassOf("NoCapture")) {
+    else if (Property->isSubClassOf("Convergent")) {
+      unsigned ArgNo = Property->getValueAsInt("ArgNo");
+      ArgumentAttributes.push_back(std::make_pair(ArgNo, Convergent));
+    } else if (Property->isSubClassOf("NoCapture")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
       ArgumentAttributes.push_back(std::make_pair(ArgNo, NoCapture));
     } else if (Property->isSubClassOf("Returned")) {
Index: utils/TableGen/IntrinsicEmitter.cpp
===================================================================
--- utils/TableGen/IntrinsicEmitter.cpp
+++ utils/TableGen/IntrinsicEmitter.cpp
@@ -560,6 +560,12 @@
 
     do {
       switch (intrinsic.ArgumentAttributes[ai].second) {
+      case CodeGenIntrinsic::Convergent:
+        if (addComma)
+          OS << ",";
+        OS << "Attribute::Convergent";
+        addComma = true;
+        break;
       case CodeGenIntrinsic::NoCapture:
         if (addComma)
           OS << ",";