Index: docs/LangRef.rst =================================================================== --- docs/LangRef.rst +++ docs/LangRef.rst @@ -1142,6 +1142,27 @@ does not alias any other memory visible within a function and that a ``swifterror`` alloca passed as an argument does not escape. +``convergent`` + In some parallel execution models, there exist operations with one or more + arguments that must be uniform across threads. Such arguments are called + ``convergent``. + + While the ``convergent`` attribute on functions can be thought of as + "whether and when each call site to this function is reached must be + uniform across multiple threads", the ``convergent`` attribute on function + arguments can be thought of as "the value of this argument at each + execution of each call site to this function must be uniform across + multiple threads". + + Specifically, two runs r1 and r2 of a program are said to be compatible + (wrt convergent function attributes) if for every call site with a + convergent argument, the sequences S1 and S2 of values supplied to the + convergent argument at that call site in r1 and r2, respectively, satisfy + that S1 is a subsequence of S2 or vice versa. + + Program transformations must ensure that every compatible pair of runs + stays compatible. + .. _gc: Garbage Collector Strategy Names Index: include/llvm/IR/Instructions.h =================================================================== --- include/llvm/IR/Instructions.h +++ include/llvm/IR/Instructions.h @@ -1775,6 +1775,11 @@ return AttributeList.hasAttrSomewhere(Attribute::ByVal); } + /// Determine if any call argument is convergent. + bool hasConvergentArgument() const { + return AttributeList.hasAttrSomewhere(Attribute::Convergent); + } + /// Return the function called, or null if this is an /// indirect function invocation. 
/// Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -49,6 +49,12 @@ // Throws - This intrinsic can throw. def Throws : IntrinsicProperty; +// Convergent - The specified argument is convergent. This corresponds to the +// IR function argument attribute of the same name. +class Convergent<int argNo> : IntrinsicProperty { + int ArgNo = argNo; +} + // NoCapture - The specified argument pointer is not captured by the intrinsic. class NoCapture<int argNo> : IntrinsicProperty { int ArgNo = argNo; Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -237,7 +237,7 @@ llvm_i1_ty, // slc(imm) llvm_i1_ty, // lwe(imm) llvm_i1_ty], // da(imm) - [IntrReadMem]>; + [IntrReadMem, Convergent<1>]>; def int_amdgcn_image_load : AMDGPUImageLoad; def int_amdgcn_image_load_mip : AMDGPUImageLoad; @@ -253,7 +253,7 @@ llvm_i1_ty, // slc(imm) llvm_i1_ty, // lwe(imm) llvm_i1_ty], // da(imm) - []>; + [Convergent<2>]>; def int_amdgcn_image_store : AMDGPUImageStore; def int_amdgcn_image_store_mip : AMDGPUImageStore; @@ -269,7 +269,7 @@ llvm_i1_ty, // slc(imm) llvm_i1_ty, // lwe(imm) llvm_i1_ty], // da(imm) - [IntrReadMem]>; + [IntrReadMem, Convergent<1>, Convergent<2>]>; // Basic sample def int_amdgcn_image_sample : AMDGPUImageSample; @@ -361,7 +361,7 @@ llvm_i1_ty, // r128(imm) llvm_i1_ty, // da(imm) llvm_i1_ty], // slc(imm) - []>; + [Convergent<2>]>; def int_amdgcn_image_atomic_swap : AMDGPUImageAtomic; def int_amdgcn_image_atomic_add : AMDGPUImageAtomic; @@ -384,7 +384,7 @@ llvm_i1_ty, // r128(imm) llvm_i1_ty, // da(imm) llvm_i1_ty], // slc(imm) - []>; + [Convergent<3>]>; class AMDGPUBufferLoad : Intrinsic < [llvm_anyfloat_ty], @@ -393,7 +393,7 @@ llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty, // glc(imm) llvm_i1_ty], // slc(imm) - [IntrReadMem]>; + 
[IntrReadMem, Convergent<0>]>; def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; def int_amdgcn_buffer_load : AMDGPUBufferLoad; @@ -405,7 +405,7 @@ llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty, // glc(imm) llvm_i1_ty], // slc(imm) - [IntrWriteMem]>; + [IntrWriteMem, Convergent<1>]>; def int_amdgcn_buffer_store_format : AMDGPUBufferStore; def int_amdgcn_buffer_store : AMDGPUBufferStore; @@ -416,7 +416,7 @@ llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty], // slc(imm) - []>; + [Convergent<1>]>; def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic; def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic; def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic; @@ -435,7 +435,7 @@ llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty], // slc(imm) - []>; + [Convergent<2>]>; def int_amdgcn_buffer_wbinvl1_sc : GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">, Index: lib/AsmParser/LLParser.cpp =================================================================== --- lib/AsmParser/LLParser.cpp +++ lib/AsmParser/LLParser.cpp @@ -1387,6 +1387,7 @@ continue; } case lltok::kw_byval: B.addAttribute(Attribute::ByVal); break; + case lltok::kw_convergent: B.addAttribute(Attribute::Convergent); break; case lltok::kw_dereferenceable: { uint64_t Bytes; if (ParseOptionalDerefAttrBytes(lltok::kw_dereferenceable, Bytes)) Index: lib/IR/Verifier.cpp =================================================================== --- lib/IR/Verifier.cpp +++ lib/IR/Verifier.cpp @@ -1324,7 +1324,6 @@ I->getKindAsEnum() == Attribute::Cold || I->getKindAsEnum() == Attribute::OptimizeNone || I->getKindAsEnum() == Attribute::JumpTable || - I->getKindAsEnum() == Attribute::Convergent || I->getKindAsEnum() == Attribute::ArgMemOnly || I->getKindAsEnum() == Attribute::NoRecurse || I->getKindAsEnum() == Attribute::InaccessibleMemOnly || @@ -1335,7 +1334,8 @@ "' only applies to functions!", V); return; } - } else if (I->getKindAsEnum() == 
Attribute::ReadOnly || + } else if (I->getKindAsEnum() == Attribute::Convergent || + I->getKindAsEnum() == Attribute::ReadOnly || I->getKindAsEnum() == Attribute::WriteOnly || I->getKindAsEnum() == Attribute::ReadNone) { if (Idx == 0) { @@ -1362,6 +1362,7 @@ if (isReturnValue) Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal) && + !Attrs.hasAttribute(Idx, Attribute::Convergent) && !Attrs.hasAttribute(Idx, Attribute::Nest) && !Attrs.hasAttribute(Idx, Attribute::StructRet) && !Attrs.hasAttribute(Idx, Attribute::NoCapture) && Index: lib/Transforms/Scalar/GVNHoist.cpp =================================================================== --- lib/Transforms/Scalar/GVNHoist.cpp +++ lib/Transforms/Scalar/GVNHoist.cpp @@ -920,7 +920,7 @@ break; } - if (Call->isConvergent()) + if (Call->isConvergent() || Call->hasConvergentArgument()) break; CI.insert(Call, VN); Index: lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- lib/Transforms/Utils/SimplifyCFG.cpp +++ lib/Transforms/Utils/SimplifyCFG.cpp @@ -1371,7 +1371,6 @@ return false; return true; - case Instruction::ShuffleVector: // Shufflevector masks are constant. return OpIdx != 2; @@ -1438,6 +1437,17 @@ // Don't touch any operand of token type. return false; + // We cannot sink calls if any argument is convergent: sinking may add + // divergence even when the argument is the same IR value in all blocks, + // because there may be divergence in the argument that was removed by the + // control flow conditions for those blocks. + if (isa<CallInst>(I0) || isa<InvokeInst>(I0)) { + ImmutableCallSite CS(I0); + if (OI < CS.getNumArgOperands() && + CS.paramHasAttr(OI + 1, Attribute::Convergent)) + return false; + } + // Because SROA can't handle speculating stores of selects, try not // to sink loads or stores of allocas when we'd have to create a PHI for // the address operand. 
Also, because it is likely that loads or stores Index: test/Bitcode/attributes.ll =================================================================== --- test/Bitcode/attributes.ll +++ test/Bitcode/attributes.ll @@ -334,6 +334,11 @@ ret void } +; CHECK: define void @f57(i32 convergent) +define void @f57(i32 convergent) { + ret void +} + ; CHECK: attributes #0 = { noreturn } ; CHECK: attributes #1 = { nounwind } ; CHECK: attributes #2 = { readnone } Index: test/Transforms/GVNHoist/hoist-convergent.ll =================================================================== --- test/Transforms/GVNHoist/hoist-convergent.ll +++ test/Transforms/GVNHoist/hoist-convergent.ll @@ -86,8 +86,35 @@ ret float %add } +; Cannot hoist because there could be divergence in the argument that was +; filtered out by the branch condition. +; +; CHECK-LABEL: @no_convergent_arg_hoisting( +; CHECK: if.then: +; CHECK: call float @convergent_arg_func( + +; CHECK: if.else: +; CHECK: call float @convergent_arg_func( +define float @no_convergent_arg_hoisting(i1 %cc, float %x) { +entry: + br i1 %cc, label %if.then, label %if.else + +if.then: + %r1 = call float @convergent_arg_func(float %x) + br label %if.end + +if.else: + %r2 = call float @convergent_arg_func(float %x) + br label %if.end + +if.end: + %r = phi float [ %r1, %if.then ], [ %r2, %if.else ] + ret float %r +} + declare float @convergent_func(float, float) #0 declare float @func(float, float) #1 +declare float @convergent_arg_func(float convergent) #1 attributes #0 = { nounwind readnone convergent } attributes #1 = { nounwind readnone } Index: test/Transforms/InstCombine/select-call.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/select-call.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +declare i32 @f(i32 convergent %a, i32 %b) nounwind readonly + +; Can convert select C, call, call -> call select if the affected argument(s) +; are not convergent. 
+; +; TODO: actually teach foldSelectOpOp to do this +; +define i32 @check_good(i1 %cond, i32 %v) { +; CHECK-LABEL: @check_good +; CHECK: %out.v = select i1 %cond, i32 0, i32 1 +; CHECK: %out = call i32 @f(i32 %v, i32 %out.v) +; +entry: + %r0 = call i32 @f(i32 %v, i32 0) + %r1 = call i32 @f(i32 %v, i32 1) + %out = select i1 %cond, i32 %r0, i32 %r1 + ret i32 %out +} + +; Must keep both calls to f here, otherwise we would introduce sources of +; divergence to the first argument of f. +define i32 @check_bad(i1 %cond, i32 %v) { +; CHECK-LABEL: @check_bad( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[R0:%.*]] = call i32 @f(i32 0, i32 %v) +; CHECK-NEXT: [[R1:%.*]] = call i32 @f(i32 1, i32 %v) +; CHECK-NEXT: [[OUT:%.*]] = select i1 %cond, i32 [[R0]], i32 [[R1]] +; CHECK-NEXT: ret i32 [[OUT]] +; +entry: + %r0 = call i32 @f(i32 0, i32 %v) + %r1 = call i32 @f(i32 1, i32 %v) + %out = select i1 %cond, i32 %r0, i32 %r1 + ret i32 %out +} Index: test/Transforms/SimplifyCFG/convergent.ll =================================================================== --- /dev/null +++ test/Transforms/SimplifyCFG/convergent.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -simplifycfg -S | FileCheck %s + +declare i32 @f(i32 convergent %a, i32 %b) nounwind readonly + +; Cannot sink because there would clearly be divergence in the first argument. 
+; +define i32 @check1(i1 %cond, i32 %a, i32 %b, i32 %v) { +; CHECK-LABEL: @check1( +; CHECK-NEXT: br i1 %cond, label %if.then, label %if.else +; CHECK: if.then: +; CHECK-NEXT: [[RA:%.*]] = call i32 @f(i32 %a, i32 %v) +; CHECK-NEXT: br label %if.end +; CHECK: if.else: +; CHECK-NEXT: [[RB:%.*]] = call i32 @f(i32 %b, i32 %v) +; CHECK-NEXT: br label %if.end +; CHECK: if.end: +; CHECK-NEXT: [[OUT:%.*]] = phi i32 [ [[RA]], %if.then ], [ [[RB]], %if.else ] +; CHECK-NEXT: ret i32 [[OUT]] +; + br i1 %cond, label %if.then, label %if.else + +if.then: + %ra = call i32 @f(i32 %a, i32 %v) + br label %if.end + +if.else: + %rb = call i32 @f(i32 %b, i32 %v) + br label %if.end + +if.end: + %out = phi i32 [ %ra, %if.then ], [ %rb, %if.else ] + ret i32 %out +} + +; Cannot sink because there could be divergence in the first argument that was +; filtered out by the branch condition. +; +define i32 @check2(i1 %cond, i32 %a, i32 %b, i32 %v) { +; CHECK-LABEL: @check2( +; CHECK-NEXT: br i1 %cond, label %if.then, label %if.else +; CHECK: if.then: +; CHECK-NEXT: [[RA:%.*]] = call i32 @f(i32 %v, i32 %a) +; CHECK-NEXT: br label %if.end +; CHECK: if.else: +; CHECK-NEXT: [[RB:%.*]] = call i32 @f(i32 %v, i32 %b) +; CHECK-NEXT: br label %if.end +; CHECK: if.end: +; CHECK-NEXT: [[OUT:%.*]] = phi i32 [ [[RA]], %if.then ], [ [[RB]], %if.else ] +; CHECK-NEXT: ret i32 [[OUT]] +; + br i1 %cond, label %if.then, label %if.else + +if.then: + %ra = call i32 @f(i32 %v, i32 %a) + br label %if.end + +if.else: + %rb = call i32 @f(i32 %v, i32 %b) + br label %if.end + +if.end: + %out = phi i32 [ %ra, %if.then ], [ %rb, %if.else ] + ret i32 %out +} Index: utils/TableGen/CodeGenIntrinsics.h =================================================================== --- utils/TableGen/CodeGenIntrinsics.h +++ utils/TableGen/CodeGenIntrinsics.h @@ -108,7 +108,7 @@ /// True if the intrinsic is marked as convergent. 
bool isConvergent; - enum ArgAttribute { NoCapture, Returned, ReadOnly, WriteOnly, ReadNone }; + enum ArgAttribute { Convergent, NoCapture, Returned, ReadOnly, WriteOnly, ReadNone }; std::vector<std::pair<unsigned, ArgAttribute>> ArgumentAttributes; CodeGenIntrinsic(Record *R); Index: utils/TableGen/CodeGenTarget.cpp =================================================================== --- utils/TableGen/CodeGenTarget.cpp +++ utils/TableGen/CodeGenTarget.cpp @@ -604,7 +604,10 @@ isConvergent = true; else if (Property->getName() == "IntrNoReturn") isNoReturn = true; - else if (Property->isSubClassOf("NoCapture")) { + else if (Property->isSubClassOf("Convergent")) { + unsigned ArgNo = Property->getValueAsInt("ArgNo"); + ArgumentAttributes.push_back(std::make_pair(ArgNo, Convergent)); + } else if (Property->isSubClassOf("NoCapture")) { unsigned ArgNo = Property->getValueAsInt("ArgNo"); ArgumentAttributes.push_back(std::make_pair(ArgNo, NoCapture)); } else if (Property->isSubClassOf("Returned")) { Index: utils/TableGen/IntrinsicEmitter.cpp =================================================================== --- utils/TableGen/IntrinsicEmitter.cpp +++ utils/TableGen/IntrinsicEmitter.cpp @@ -560,6 +560,12 @@ do { switch (intrinsic.ArgumentAttributes[ai].second) { + case CodeGenIntrinsic::Convergent: + if (addComma) + OS << ","; + OS << "Attribute::Convergent"; + addComma = true; + break; case CodeGenIntrinsic::NoCapture: if (addComma) OS << ",";