https://reviews.llvm.org/D140798 introduces a regression in the AMDGPU backend.
If the zero check is folded to llvm.usub.sat before inlining, the fold hides opportunities for other inst-combine transforms that should take precedence.
As a result, we get suboptimal code.
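For reference, the fold being blamed here (it is visible in @snork in the example below) is the canonicalization of a compare-and-decrement into usub.sat; roughly, with placeholder value names:

  %c = icmp ne i32 %x, 0
  %d = sub i32 %x, 1
  %r = select i1 %c, i32 %d, i32 0
; becomes
  %r = call i32 @llvm.usub.sat.i32(i32 %x, i32 1)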
Example:
define i32 @zot(i32 noundef %arg) {
bb:
  %inst = icmp eq i32 %arg, 0
  %inst1 = call i32 @llvm.cttz.i32(i32 %arg, i1 true)
  %inst2 = select i1 %inst, i32 -1, i32 %inst1
  %inst3 = add nsw i32 %inst2, 1
  ret i32 %inst3
}

define i32 @snork(i32 %arg, i32 noundef %arg1) {
bb:
  %inst = xor i32 %arg1, -1
  %inst2 = call i32 @zot(i32 noundef %inst)
  %inst3 = icmp ne i32 %inst2, 0
  %inst4 = sub i32 %inst2, 1
  %inst5 = select i1 %inst3, i32 %inst4, i32 0
  ret i32 %inst5
}
Running "opt -S -passes=instcombine test.ll" gives:
define i32 @zot(i32 noundef %arg) {
bb:
  %inst = icmp eq i32 %arg, 0
  %inst1 = call i32 @llvm.cttz.i32(i32 %arg, i1 true), !range !0
  %inst1.op = add nuw nsw i32 %inst1, 1
  %inst3 = select i1 %inst, i32 0, i32 %inst1.op
  ret i32 %inst3
}

define i32 @snork(i32 %arg, i32 noundef %arg1) {
bb:
  %inst = xor i32 %arg1, -1
  %inst2 = call i32 @zot(i32 noundef %inst)
  %inst5 = call i32 @llvm.usub.sat.i32(i32 %inst2, i32 1)
  ret i32 %inst5
}
"opt -S -passes=module-inline" gets
define i32 @snork(i32 %arg, i32 noundef %arg1) {
bb:
  %inst = xor i32 %arg1, -1
  %inst.i = icmp eq i32 %inst, 0
  %inst1.i = call i32 @llvm.cttz.i32(i32 %inst, i1 true), !range !0
  %inst1.op.i = add nuw nsw i32 %inst1.i, 1
  %inst3.i = select i1 %inst.i, i32 0, i32 %inst1.op.i
  %inst5 = call i32 @llvm.usub.sat.i32(i32 %inst3.i, i32 1)
  ret i32 %inst5
}
The next inst-combine invocation cannot do much with it:
define i32 @snork(i32 %arg, i32 noundef %arg1) {
bb:
  %inst = xor i32 %arg1, -1
  %inst.i = icmp eq i32 %arg1, -1
  %inst1.i = call i32 @llvm.cttz.i32(i32 %inst, i1 true), !range !0
  %inst1.op.i = add nuw nsw i32 %inst1.i, 1
  %inst3.i = select i1 %inst.i, i32 0, i32 %inst1.op.i
  %inst5 = call i32 @llvm.usub.sat.i32(i32 %inst3.i, i32 1)
  ret i32 %inst5
}
The result before the change looks better:
define i32 @snork(i32 %arg, i32 noundef %arg1) {
bb:
  %inst = xor i32 %arg1, -1
  %inst.i = icmp eq i32 %arg1, -1
  %inst1.i = call i32 @llvm.cttz.i32(i32 %inst, i1 true), !range !0
  %inst5 = select i1 %inst.i, i32 0, i32 %inst1.i
  ret i32 %inst5
}
Furthermore, the difference gets even worse if we follow it through the next inline pass, which inlines the ctlz-related code...
My idea is simply to postpone the zero-check folding while the compare argument is a call that has not been inlined yet.
If that call does get inlined, the next inst-combine invocation gets a chance to do a better job; if not, it does the same as before.
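To illustrate (a hand-reconstructed sketch, not actual pass output): with the usub.sat formation in @snork postponed, the IR right after inlining would keep the original compare/sub/select around the inlined body of @zot:

define i32 @snork(i32 %arg, i32 noundef %arg1) {
bb:
  %inst = xor i32 %arg1, -1
  %inst.i = icmp eq i32 %inst, 0
  %inst1.i = call i32 @llvm.cttz.i32(i32 %inst, i1 true), !range !0
  %inst1.op.i = add nuw nsw i32 %inst1.i, 1
  %inst3.i = select i1 %inst.i, i32 0, i32 %inst1.op.i
  %inst3 = icmp ne i32 %inst3.i, 0
  %inst4 = sub i32 %inst3.i, 1
  %inst5 = select i1 %inst3, i32 %inst4, i32 0
  ret i32 %inst5
}

From there the zero check on %inst3.i can be decided by looking through the select (the non-zero arm is an add nuw of 1 and therefore never zero), which is what would let the next inst-combine reach the better form shown above.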
As far as I can tell, this is just a matter of missing transforms for usub_sat. I've applied https://github.com/llvm/llvm-project/commit/6261adfa51f2d93c8258e7b2d9811d607f3e50a5 and https://github.com/llvm/llvm-project/commit/3f53a58597bc84418f8b83934ef2f5c5d615626b to fix the issue from your example. Let me know if you see further issues.
@nikic, thanks a lot, I have cherry-picked those commits, and they help.
I suspected we were "lucky" enough to hit the middle of an actively developed feature.
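For reference, my reading of what makes the difference here (placeholder names, not a description of the exact folds in those commits) is a simplification along these lines:

  %y = add nuw i32 %x, 1
  %s = select i1 %c, i32 0, i32 %y
  %r = call i32 @llvm.usub.sat.i32(i32 %s, i32 1)
; simplifies to
  %r = select i1 %c, i32 0, i32 %x

which is exactly what is needed to recover the optimal select in the inlined @snork.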