This patch is a continuation of D150110. It splits ADD and SUB out into their own switch cases so that computeForAddSub can be called directly and the node's NSW flag passed along. This allows better optimization when the NSW flag is set, and makes it possible to fix the TODO that was previously in SimplifyDemandedBits.
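For reference, here is a minimal sketch of the shape of this change. The real code lives in SelectionDAG::computeKnownBits in SelectionDAG.cpp; the exact signature of KnownBits::computeForAddSub and the surrounding structure shown here are simplified, not the verbatim patch:

```cpp
// Sketch only: ADD and SUB get their own cases so the node's NSW flag can be
// forwarded to KnownBits::computeForAddSub instead of being dropped.
case ISD::ADD:
case ISD::SUB: {
  // Known and Known2 are the pre-declared per-operand KnownBits locals in
  // computeKnownBits (see the inline review comment below).
  Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
  Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
  Known = KnownBits::computeForAddSub(/*Add=*/Op.getOpcode() == ISD::ADD,
                                      Op->getFlags().hasNoSignedWrap(),
                                      Known2, Known);
  break;
}
```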
Event Timeline
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp:3652–3658
There are pre-defined variables to put the operand known bits in, if there are just two.
- Address reviewer comments
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp:3652–3658
Fixed, thanks!
As I do not have commit access, can you please land this patch on my behalf @nikic using "Dhruv Chawla <dhruv263.dc@gmail.com>"? I guess I should apply for it sometime soon.
Heads up: this commit has triggered a crash in some of our code, and we suspect it leads to a miscompilation. We're trying to get a reduced test case, but it's taking a long time, since it relies on FDO and ThinLTO.
The following IR was reduced from https://github.com/v8/v8/blob/main/src/json/json-parser.cc (which needs a specific FDO profile to demonstrate the issue, but the reduced example doesn't):
$ cat reduced.ll
; ModuleID = '<bc file>'
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define weak_odr ptr @_ZN2v88internal10JsonParserItE14ParseJsonValueEv(ptr %0, ptr %_e.llvm.2168063466968335352, i8 %1, i32 %2) {
_q.exit:
  switch i8 %1, label %.preheader193 [
    i8 13, label %common.ret
    i8 0, label %common.ret
    i8 2, label %common.ret
    i8 4, label %common.ret
    i8 6, label %common.ret
    i8 7, label %common.ret
    i8 8, label %common.ret
    i8 10, label %common.ret
    i8 11, label %common.ret
    i8 12, label %common.ret
    i8 3, label %common.ret
    i8 1, label %common.ret
  ]

.preheader193:                                    ; preds = %6, %_q.exit
  %3 = phi ptr [ %7, %6 ], [ null, %_q.exit ]
  %4 = load i8, ptr %3, align 1
  %5 = icmp eq i8 %4, 0
  br i1 %5, label %6, label %_w.exit.thread137

6:                                                ; preds = %.preheader193
  %7 = getelementptr i16, ptr %3, i64 1
  br label %.preheader193

_w.exit.thread137:                                ; preds = %.preheader193
  %8 = call ptr @_ZN2v88internal11HandleScope6ExtendEPNS0_7IsolateE()
  switch i32 %2, label %vector.memcheck [
    i32 1, label %common.ret
    i32 6, label %common.ret
    i32 0, label %common.ret
    i32 4, label %common.ret
  ]

common.ret:                                       ; preds = %vector.body, %_w.exit.thread137, %_w.exit.thread137, %_w.exit.thread137, %_w.exit.thread137, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit
  ret ptr null

vector.memcheck:                                  ; preds = %_w.exit.thread137
  %9 = ptrtoint ptr %3 to i64
  %10 = lshr i64 %9, 1
  %11 = add i64 %10, 1
  %12 = and i64 %11, 4294967295
  %13 = add i64 %12, 9223372036854775807
  %14 = and i64 %13, 9223372036854775807
  %15 = add i64 %14, 1
  %n.vec = and i64 %15, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.memcheck
  %index1 = phi i64 [ 0, %vector.memcheck ], [ %index.next, %vector.body ]
  store <8 x i8> zeroinitializer, ptr %_e.llvm.2168063466968335352, align 1
  store <8 x i8> zeroinitializer, ptr %0, align 1
  %index.next = add i64 %index1, 1
  %16 = icmp eq i64 %index1, %n.vec
  br i1 %16, label %common.ret, label %vector.body
}

declare ptr @_ZN2v88internal11HandleScope6ExtendEPNS0_7IsolateE()

$ diff -u100 <(./clang-good -O2 reduced.ll -S -o -) <(./clang-bad -O2 reduced.ll -S -o -)
--- /dev/fd/63  2023-05-25 03:58:34.109902423 +0200
+++ /dev/fd/62  2023-05-25 03:58:34.109902423 +0200
@@ -1,114 +1,114 @@
     .text
     .file "reduced.ll"
     .weak _ZN2v88internal10JsonParserItE14ParseJsonValueEv # -- Begin function _ZN2v88internal10JsonParserItE14ParseJsonValueEv
     .p2align 4, 0x90
     .type _ZN2v88internal10JsonParserItE14ParseJsonValueEv,@function
 _ZN2v88internal10JsonParserItE14ParseJsonValueEv: # @_ZN2v88internal10JsonParserItE14ParseJsonValueEv
     .cfi_startproc
 # %bb.0:                                # %_q.exit
     pushq %rbp
     .cfi_def_cfa_offset 16
     pushq %r15
     .cfi_def_cfa_offset 24
     pushq %r14
     .cfi_def_cfa_offset 32
     pushq %rbx
     .cfi_def_cfa_offset 40
     pushq %rax
     .cfi_def_cfa_offset 48
     .cfi_offset %rbx, -40
     .cfi_offset %r14, -32
     .cfi_offset %r15, -24
     .cfi_offset %rbp, -16
     movl %ecx, %ebp
     movq %rsi, %rbx
     movq %rdi, %r14
     cmpb $13, %dl
     ja .LBB0_2
 # %bb.1:                                # %_q.exit
     movzbl %dl, %eax
     movl $15839, %ecx                   # imm = 0x3DDF
     btl %eax, %ecx
     jae .LBB0_2
 .LBB0_9:                                # %common.ret
     xorl %eax, %eax
     addq $8, %rsp
     .cfi_def_cfa_offset 40
     popq %rbx
     .cfi_def_cfa_offset 32
     popq %r14
     .cfi_def_cfa_offset 24
     popq %r15
     .cfi_def_cfa_offset 16
     popq %rbp
     .cfi_def_cfa_offset 8
     retq
 .LBB0_2:                                # %.preheader193.preheader
     .cfi_def_cfa_offset 48
     movq $-2, %r15
     .p2align 4, 0x90
 .LBB0_3:                                # %.preheader193
                                         # =>This Inner Loop Header: Depth=1
     cmpb $0, 2(%r15)
     leaq 2(%r15), %r15
     je .LBB0_3
 # %bb.4:                                # %_w.exit.thread137
     callq _ZN2v88internal11HandleScope6ExtendEPNS0_7IsolateE@PLT
     cmpl $6, %ebp
     ja .LBB0_6
 # %bb.5:                                # %_w.exit.thread137
     movl $83, %eax
     btl %ebp, %eax
     jb .LBB0_9
 .LBB0_6:                                # %vector.memcheck
     shrq %r15
-    leal 1(%r15), %ecx
-    decq %rcx
-    movabsq $9223372036854775807, %rax  # imm = 0x7FFFFFFFFFFFFFFF
+    leal 1(%r15), %eax
+    decq %rax
+    movabsq $9223372036854775807, %rcx  # imm = 0x7FFFFFFFFFFFFFFF
     andq %rax, %rcx
     incq %rcx
     testq $-16, %rcx
     je .LBB0_7
 # %bb.10:                               # %vector.body.preheader
     incl %r15d
-    decq %r15
-    andq %rax, %r15
-    incq %r15
-    andq $-16, %r15
+    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+    orq %r15, %rcx
+    movabsq $-9223372032559808528, %rax # imm = 0x80000000FFFFFFF0
+    andq %rcx, %rax
     .p2align 4, 0x90
 .LBB0_11:                               # %vector.body
                                         # =>This Inner Loop Header: Depth=1
     movq $0, (%rbx)
     movq $0, (%r14)
     movq $0, (%rbx)
     movq $0, (%r14)
     movq $0, (%rbx)
     movq $0, (%r14)
     movq $0, (%rbx)
     movq $0, (%r14)
     movq $0, (%rbx)
     movq $0, (%r14)
     movq $0, (%rbx)
     movq $0, (%r14)
     movq $0, (%rbx)
     movq $0, (%r14)
     movq $0, (%rbx)
     movq $0, (%r14)
-    addq $-8, %r15
+    addq $-8, %rax
     jne .LBB0_11
 .LBB0_7:                                # %vector.body.epil.preheader
     movl $1, %eax
     .p2align 4, 0x90
 .LBB0_8:                                # %vector.body.epil
                                         # =>This Inner Loop Header: Depth=1
     movq $0, (%rbx)
     movq $0, (%r14)
     decq %rax
     jne .LBB0_8
     jmp .LBB0_9
 .Lfunc_end0:
     .size _ZN2v88internal10JsonParserItE14ParseJsonValueEv, .Lfunc_end0-_ZN2v88internal10JsonParserItE14ParseJsonValueEv
     .cfi_endproc
                                         # -- End function
     .section ".note.GNU-stack","",@progbits
     .addrsig
I hope the automatic reduction retained the validity of the IR, but I'm not fluent enough in LLVM IR and x86 assembly to verify.
@alexfh Thanks for the report, I've reverted the change for now.
This patch has exposed a pre-existing miscompile in integer legalization. The relevant pre-legalization DAG is:
t0: ch,glue = EntryToken
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t5: i64 = srl t2, Constant:i8<1>
t6: i32 = truncate t5
t8: i32 = add t6, Constant:i32<1>
t9: i63 = zero_extend t8
t11: i63 = add nsw t9, Constant:i63<-1>
t12: i64 = zero_extend t11
t13: i64 = add nuw t12, Constant:i64<1>
t25: i64 = and t13, Constant:i64<-16>
t21: i64 = or t25, Constant:i64<1>
t20: ch = CopyToReg t0, Register:i64 %2, t21
The type legalized DAG is:
t0: ch,glue = EntryToken
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t5: i64 = srl t2, Constant:i8<1>
t6: i32 = truncate t5
t8: i32 = add t6, Constant:i32<1>
t27: i64 = zero_extend t8
t28: i64 = add nsw t27, Constant:i64<9223372036854775807>
t29: i64 = and t28, Constant:i64<9223372036854775807>
t13: i64 = add nuw t29, Constant:i64<1>
t25: i64 = and t13, Constant:i64<-16>
t21: i64 = or t25, Constant:i64<1>
t20: ch = CopyToReg t0, Register:i64 %2, t21
Note that the nsw from the add of -1 was incorrectly transferred to the add of INT_MAX. Legalization of adds works on any-ext promoted operands, but then https://github.com/llvm/llvm-project/blob/89242ed6a28b8227890af7691e573bfc4dc55a91/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp#LL718C22-L718C22 will just blindly copy (!!!) all flags from the old node to the new one. It would be valid to do this for add nsw if we used sign-extended operands, but that's not what we do. This is embarrassingly bad.
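To make the unsoundness concrete, here is a standalone arithmetic illustration (plain C++, not LLVM code) built from the constants in the DAGs above. In i63, the constant -1 is the bit pattern 0x7FFFFFFFFFFFFFFF; after the operands are any-extended to i64, that same bit pattern reads as INT64_MAX, and the wide add can signed-wrap even though the narrow one never did:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t x = 1; // a zero-extended i32 value feeding the add
  const uint64_t c = 0x7FFFFFFFFFFFFFFFull; // -1 in i63, INT64_MAX in i64

  // i63 semantics: (1 + -1) mod 2^63 == 0. No signed wrap in 63 bits, so
  // "add nsw" was correct on the narrow node.
  uint64_t narrow = (x + c) & 0x7FFFFFFFFFFFFFFFull;

  // i64 semantics after promotion: 1 + INT64_MAX == INT64_MIN. A positive
  // plus a positive yields a negative, i.e. signed wrap, so the copied nsw
  // asserts something false.
  uint64_t wide = x + c;

  printf("i63 result: %llu\n", (unsigned long long)narrow); // 0
  printf("i64 result: 0x%llx\n", (unsigned long long)wide); // 0x8000000000000000
  return 0;
}
```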
The good news is that removing that flag copy doesn't cause catastrophic test fallout: https://gist.github.com/nikic/5f1d3b0c78f646f74c84aac68a40de76. So we probably just need to move the flag propagation into the individual promotion methods where it is safe.
We just stumbled upon the same thing downstream and I also tracked it down to DAGTypeLegalizer::SetPromotedInteger.
I think it would be nice to at least stop copying the nsw/nuw flags in DAGTypeLegalizer::SetPromotedInteger as a first step (the original intent seems to have been to copy fast-math flags; no idea if even that is always safe, maybe not?). A sketch of that first step follows.
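A minimal sketch of that first step, assuming the flag copy in DAGTypeLegalizer::SetPromotedInteger looks roughly like it does at the line linked above, with Op the original narrow value and Result its promoted replacement. The filtering shown here is illustrative, not an actual patch:

```cpp
// In DAGTypeLegalizer::SetPromotedInteger: rather than copying every flag
// from the narrow node to its promoted replacement, strip the wrap flags,
// which are only meaningful at the original bit width. Fast-math flags (the
// apparent original intent, e.g. for SETCC) still carry over.
SDNodeFlags Flags = Op->getFlags();
Flags.setNoSignedWrap(false);   // invalid after any-extension of operands
Flags.setNoUnsignedWrap(false); // likewise
Result->setFlags(Flags);
```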
And then we could try to add back nsw/nuw propagation at a more fine-grained, per-operation level. When dealing with an ADD operation, for example, we need to figure out whether it is better to promote the operands by sign extension so that nsw can remain, or to promote without sext and drop the flag. Sometimes the promotion would be by sign extension anyway (I was looking at test/CodeGen/AArch64/arm64-vhadd.ll), and in that specific case I guess we want to keep the nsw flag.
Has anyone already written a github issue for following up on the DAGTypeLegalizer::SetPromotedInteger bug?
Thanks for the quick reaction! I've verified that the revert actually fixes the original problem we've seen.
Looks like that line in SetPromotedInteger was added to propagate FMF flags for SETCC. I'm happy to take ownership of fixing this unless someone else is already working on it?
I am interested in working on this, but if you already have a fix going please feel free to take over :)
I have a patch that removes the code from SetPromotedInteger, preserves FMF for setcc, and preserves nuw/nsw when promoting add/sub if the promoted value happens to be sign/zero extended according to computeKnownBits/ComputeNumSignBits. This avoids all of the test changes from removing the code in SetPromotedInteger.
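For the curious, a rough sketch of what that nuw/nsw preservation could look like inside an integer-promotion method for add/sub (LHS/RHS are the promoted operands; the placement and details in the real patch may differ):

```cpp
// The wrap flags may be kept only if the promoted operands are faithful
// extensions of the narrow values:
//   - nsw survives if both operands are sign-extended (enough known sign bits),
//   - nuw survives if both operands are zero-extended (high bits known zero).
SDValue LHS = GetPromotedInteger(N->getOperand(0));
SDValue RHS = GetPromotedInteger(N->getOperand(1));
SDNodeFlags Flags = N->getFlags();

unsigned OrigBits = N->getValueType(0).getScalarSizeInBits();
unsigned PromBits = LHS.getValueType().getScalarSizeInBits();

if (Flags.hasNoSignedWrap()) {
  // Sign-extended means at least (PromBits - OrigBits + 1) known sign bits.
  unsigned Needed = PromBits - OrigBits + 1;
  if (DAG.ComputeNumSignBits(LHS) < Needed ||
      DAG.ComputeNumSignBits(RHS) < Needed)
    Flags.setNoSignedWrap(false);
}
if (Flags.hasNoUnsignedWrap()) {
  // Zero-extended means all bits above the original width are known zero.
  APInt HiBits = APInt::getBitsSetFrom(PromBits, OrigBits);
  if (!DAG.MaskedValueIsZero(LHS, HiBits) ||
      !DAG.MaskedValueIsZero(RHS, HiBits))
    Flags.setNoUnsignedWrap(false);
}

SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N),
                          LHS.getValueType(), LHS, RHS, Flags);
```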
I'll break it into two parts: the SetPromotedInteger change plus FMF preservation in one part, and nuw/nsw preservation in the second.