diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp @@ -192,12 +192,13 @@ NewRoots.clear(); for (auto &F : M.functions()) { - if (F.isDeclaration() || Roots.count(&F) || Roots.count(&F)) + if (F.isDeclaration()) continue; const FeatureBitset &CalleeBits = TM->getSubtargetImpl(F)->getFeatureBits(); SmallVector, 32> ToReplace; + SmallSet Visited; for (User *U : F.users()) { Instruction *I = dyn_cast(U); @@ -207,16 +208,17 @@ if (!CI) continue; Function *Caller = CI->getCaller(); - if (!Caller) + if (!Caller || !Visited.insert(CI).second) continue; - if (!Roots.count(Caller)) + if (!Roots.count(Caller) && !NewRoots.count(Caller)) continue; const FeatureBitset &CallerBits = TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures; if (CallerBits == (CalleeBits & TargetFeatures)) { - NewRoots.insert(&F); + if (!Roots.count(&F)) + NewRoots.insert(&F); continue; } @@ -258,6 +260,9 @@ F->eraseFromParent(); } + Roots.clear(); + Clones.clear(); + return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-clone.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-clone.ll --- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-clone.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-clone.ll @@ -7,17 +7,54 @@ ; OPT-EXT: define void @foo3() local_unnamed_addr #1 ; OPT-INT: define internal fastcc void @foo3.2() unnamed_addr #1 ; OPT-EXT: define void @foo2() local_unnamed_addr #1 -; OPT-INT: define internal fastcc void @foo2() unnamed_addr #1 +; OPT-INT: define internal fastcc void @foo2.3() unnamed_addr #1 ; OPT-EXT: define void @foo1() local_unnamed_addr #1 +; OPT-EXT: tail call void @foo4() +; OPT-EXT: tail call void @foo3() +; OPT-EXT: tail call void @foo2() +; OPT-EXT: tail call void @foo2() +; OPT-EXT: tail call void @foo1() +; OPT-EXT: tail call fastcc void @0() ; OPT-INT: define internal fastcc void @foo1.1() unnamed_addr #1 +; OPT-INT: tail call void @foo4() +; OPT-INT: tail call fastcc void @foo3.2() +; OPT-INT: tail call fastcc void @foo2.3() +; OPT-INT: tail call fastcc void @foo2.3() +; OPT-INT: tail call fastcc void @foo1.1() +; OPT-INT: tail call fastcc void @0() +; OPT: ret void ; OPT: define amdgpu_kernel void @kernel1() local_unnamed_addr #2 +; OPT-EXT: tail call fastcc void @foo1.1() +; OPT-INT: tail call fastcc void @foo1() +; OPT: ret void ; OPT: define amdgpu_kernel void @kernel2() local_unnamed_addr #3 +; OPT-EXT: tail call void @foo2() +; OPT-INT: tail call fastcc void @foo2.3() +; OPT: ret void ; OPT: define amdgpu_kernel void @kernel3() local_unnamed_addr #3 +; OPT-EXT: tail call void @foo1() +; OPT-INT: tail call fastcc void @foo1.1() +; OPT: ret void ; OPT-EXT: define internal fastcc void @foo1.1() unnamed_addr #4 +; OPT-EXT: tail call void @foo4() +; OPT-EXT: tail call fastcc void @foo3.2() +; OPT-EXT: tail call fastcc void @foo2.3() +; OPT-EXT: tail call fastcc void @foo2.3() +; OPT-EXT: tail call fastcc void @foo1.1() +; OPT-EXT: tail call fastcc void @1() ; OPT-INT: define internal fastcc void @foo1() unnamed_addr #4 +; OPT-INT: tail call void @foo4() +; OPT-INT: tail call fastcc void @foo3() +; OPT-INT: tail call fastcc void @foo2() +; OPT-INT: tail call fastcc void @foo2() +; OPT-INT: tail call fastcc void @foo1() +; OPT-INT: tail call fastcc void @1() +; OPT: ret void ; OPT: define internal fastcc void @1() unnamed_addr #4 ; OPT-EXT: define internal fastcc void @foo3.2() unnamed_addr #4 ; OPT-INT: define internal fastcc void @foo3() unnamed_addr #4 +; OPT-EXT: define internal fastcc void @foo2.3() unnamed_addr #4 +; OPT-INT: define internal fastcc void @foo2() unnamed_addr #4 ; OPT: attributes #0 = { {{.*}} "target-features"="+wavefrontsize64" } ; OPT: attributes #1 = { {{.*}} "target-features"="{{.*}},-wavefrontsize16,-wavefrontsize32,+wavefrontsize64{{.*}}" } ; OPT: attributes #2 = { {{.*}} "target-features"="+wavefrontsize32" }