This patch is a continuation of D150110. It splits ADD and SUB out into their own cases so that computeForAddSub can be called directly and the NSW flag passed through. This allows better optimization when the NSW flag is set, and allows fixing the TODO that was previously in SimplifyDemandedBits.
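A minimal sketch of the shape of the change (not the exact diff; variable names like Known/Known2 follow the surrounding computeKnownBits code and are assumptions here):

// Sketch: ADD and SUB get their own cases instead of sharing generic
// handling, so the node's NSW flag can be forwarded to computeForAddSub.
case ISD::ADD:
case ISD::SUB: {
  // Known and Known2 are assumed to already hold the known bits of the
  // two operands, as computed for the other binary-op cases.
  bool NSW = Op->getFlags().hasNoSignedWrap();
  Known = KnownBits::computeForAddSub(/*Add=*/Opcode == ISD::ADD, NSW,
                                      Known, Known2);
  break;
}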
Diff Detail
- Repository
- rG LLVM Github Monorepo
Event Timeline
| llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | |
|---|---|
| 3652–3658 | There are pre-defined variables to put the operand known bits in, if there's just two. |
- Address reviewer comments
| llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | |
|---|---|
| 3652–3658 | Fixed, thanks! |
As I do not have commit access, can you please land this patch on my behalf @nikic using "Dhruv Chawla <dhruv263.dc@gmail.com>"? I guess I should apply for it sometime soon.
Heads up: this commit has triggered a crash in some of our code. We suspect it leads to a miscompilation. We're trying to get a reduced test case, but it's taking a long time, since it relies on FDO and ThinLTO.
The following IR was reduced from https://github.com/v8/v8/blob/main/src/json/json-parser.cc (which needs a specific FDO profile to demonstrate the issue, but the reduced example doesn't):
$ cat reduced.ll
; ModuleID = '<bc file>'
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define weak_odr ptr @_ZN2v88internal10JsonParserItE14ParseJsonValueEv(ptr %0, ptr %_e.llvm.2168063466968335352, i8 %1, i32 %2) {
_q.exit:
switch i8 %1, label %.preheader193 [
i8 13, label %common.ret
i8 0, label %common.ret
i8 2, label %common.ret
i8 4, label %common.ret
i8 6, label %common.ret
i8 7, label %common.ret
i8 8, label %common.ret
i8 10, label %common.ret
i8 11, label %common.ret
i8 12, label %common.ret
i8 3, label %common.ret
i8 1, label %common.ret
]
.preheader193: ; preds = %6, %_q.exit
%3 = phi ptr [ %7, %6 ], [ null, %_q.exit ]
%4 = load i8, ptr %3, align 1
%5 = icmp eq i8 %4, 0
br i1 %5, label %6, label %_w.exit.thread137
6: ; preds = %.preheader193
%7 = getelementptr i16, ptr %3, i64 1
br label %.preheader193
_w.exit.thread137: ; preds = %.preheader193
%8 = call ptr @_ZN2v88internal11HandleScope6ExtendEPNS0_7IsolateE()
switch i32 %2, label %vector.memcheck [
i32 1, label %common.ret
i32 6, label %common.ret
i32 0, label %common.ret
i32 4, label %common.ret
]
common.ret: ; preds = %vector.body, %_w.exit.thread137, %_w.exit.thread137, %_w.exit.thread137, %_w.exit.thread137, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit, %_q.exit
ret ptr null
vector.memcheck: ; preds = %_w.exit.thread137
%9 = ptrtoint ptr %3 to i64
%10 = lshr i64 %9, 1
%11 = add i64 %10, 1
%12 = and i64 %11, 4294967295
%13 = add i64 %12, 9223372036854775807
%14 = and i64 %13, 9223372036854775807
%15 = add i64 %14, 1
%n.vec = and i64 %15, -16
br label %vector.body
vector.body: ; preds = %vector.body, %vector.memcheck
%index1 = phi i64 [ 0, %vector.memcheck ], [ %index.next, %vector.body ]
store <8 x i8> zeroinitializer, ptr %_e.llvm.2168063466968335352, align 1
store <8 x i8> zeroinitializer, ptr %0, align 1
%index.next = add i64 %index1, 1
%16 = icmp eq i64 %index1, %n.vec
br i1 %16, label %common.ret, label %vector.body
}
declare ptr @_ZN2v88internal11HandleScope6ExtendEPNS0_7IsolateE()
$ diff -u100 <(./clang-good -O2 reduced.ll -S -o -) <(./clang-bad -O2 reduced.ll -S -o -)
--- /dev/fd/63 2023-05-25 03:58:34.109902423 +0200
+++ /dev/fd/62 2023-05-25 03:58:34.109902423 +0200
@@ -1,114 +1,114 @@
.text
.file "reduced.ll"
.weak _ZN2v88internal10JsonParserItE14ParseJsonValueEv # -- Begin function _ZN2v88internal10JsonParserItE14ParseJsonValueEv
.p2align 4, 0x90
.type _ZN2v88internal10JsonParserItE14ParseJsonValueEv,@function
_ZN2v88internal10JsonParserItE14ParseJsonValueEv: # @_ZN2v88internal10JsonParserItE14ParseJsonValueEv
.cfi_startproc
# %bb.0: # %_q.exit
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %rbx
.cfi_def_cfa_offset 40
pushq %rax
.cfi_def_cfa_offset 48
.cfi_offset %rbx, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
movl %ecx, %ebp
movq %rsi, %rbx
movq %rdi, %r14
cmpb $13, %dl
ja .LBB0_2
# %bb.1: # %_q.exit
movzbl %dl, %eax
movl $15839, %ecx # imm = 0x3DDF
btl %eax, %ecx
jae .LBB0_2
.LBB0_9: # %common.ret
xorl %eax, %eax
addq $8, %rsp
.cfi_def_cfa_offset 40
popq %rbx
.cfi_def_cfa_offset 32
popq %r14
.cfi_def_cfa_offset 24
popq %r15
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.LBB0_2: # %.preheader193.preheader
.cfi_def_cfa_offset 48
movq $-2, %r15
.p2align 4, 0x90
.LBB0_3: # %.preheader193
# =>This Inner Loop Header: Depth=1
cmpb $0, 2(%r15)
leaq 2(%r15), %r15
je .LBB0_3
# %bb.4: # %_w.exit.thread137
callq _ZN2v88internal11HandleScope6ExtendEPNS0_7IsolateE@PLT
cmpl $6, %ebp
ja .LBB0_6
# %bb.5: # %_w.exit.thread137
movl $83, %eax
btl %ebp, %eax
jb .LBB0_9
.LBB0_6: # %vector.memcheck
shrq %r15
- leal 1(%r15), %ecx
- decq %rcx
- movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+ leal 1(%r15), %eax
+ decq %rax
+ movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
andq %rax, %rcx
incq %rcx
testq $-16, %rcx
je .LBB0_7
# %bb.10: # %vector.body.preheader
incl %r15d
- decq %r15
- andq %rax, %r15
- incq %r15
- andq $-16, %r15
+ movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+ orq %r15, %rcx
+ movabsq $-9223372032559808528, %rax # imm = 0x80000000FFFFFFF0
+ andq %rcx, %rax
.p2align 4, 0x90
.LBB0_11: # %vector.body
# =>This Inner Loop Header: Depth=1
movq $0, (%rbx)
movq $0, (%r14)
movq $0, (%rbx)
movq $0, (%r14)
movq $0, (%rbx)
movq $0, (%r14)
movq $0, (%rbx)
movq $0, (%r14)
movq $0, (%rbx)
movq $0, (%r14)
movq $0, (%rbx)
movq $0, (%r14)
movq $0, (%rbx)
movq $0, (%r14)
movq $0, (%rbx)
movq $0, (%r14)
- addq $-8, %r15
+ addq $-8, %rax
jne .LBB0_11
.LBB0_7: # %vector.body.epil.preheader
movl $1, %eax
.p2align 4, 0x90
.LBB0_8: # %vector.body.epil
# =>This Inner Loop Header: Depth=1
movq $0, (%rbx)
movq $0, (%r14)
decq %rax
jne .LBB0_8
jmp .LBB0_9
.Lfunc_end0:
.size _ZN2v88internal10JsonParserItE14ParseJsonValueEv, .Lfunc_end0-_ZN2v88internal10JsonParserItE14ParseJsonValueEv
.cfi_endproc
# -- End function
.section ".note.GNU-stack","",@progbits
.addrsig

I hope the automatic reduction retained the validity of the IR, but I'm not fluent enough in LLVM IR and x86 assembly to verify.
@alexfh Thanks for the report, I've reverted the change for now.
This patch has exposed a pre-existing miscompile in integer legalization. The relevant pre-legalization DAG is:
t0: ch,glue = EntryToken
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t5: i64 = srl t2, Constant:i8<1>
t6: i32 = truncate t5
t8: i32 = add t6, Constant:i32<1>
t9: i63 = zero_extend t8
t11: i63 = add nsw t9, Constant:i63<-1>
t12: i64 = zero_extend t11
t13: i64 = add nuw t12, Constant:i64<1>
t25: i64 = and t13, Constant:i64<-16>
t21: i64 = or t25, Constant:i64<1>
t20: ch = CopyToReg t0, Register:i64 %2, t21

The type-legalized DAG is:
t0: ch,glue = EntryToken
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t5: i64 = srl t2, Constant:i8<1>
t6: i32 = truncate t5
t8: i32 = add t6, Constant:i32<1>
t27: i64 = zero_extend t8
t28: i64 = add nsw t27, Constant:i64<9223372036854775807>
t29: i64 = and t28, Constant:i64<9223372036854775807>
t13: i64 = add nuw t29, Constant:i64<1>
t25: i64 = and t13, Constant:i64<-16>
t21: i64 = or t25, Constant:i64<1>
t20: ch = CopyToReg t0, Register:i64 %2, t21

Note that the nsw from the add -1 was incorrectly transferred to the add INT_MAX. Legalization of adds works on any-ext promoted operands, but then https://github.com/llvm/llvm-project/blob/89242ed6a28b8227890af7691e573bfc4dc55a91/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp#LL718C22-L718C22 will just blindly copy (!!!) all flags from the old node to the new one. It would be valid to do this for add nsw if we used sign-extended operands, but that's not what we do. This is embarrassingly bad.
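To make the failure concrete, here is a standalone C++ sketch (not LLVM code; values chosen for illustration) of why the copied nsw is a lie on any-extended operands:

#include <cstdint>
#include <cstdio>

int main() {
  // In i63, "add nsw %x, -1" with %x = 1 gives 0: no signed overflow,
  // since %x is known non-negative (it comes from a zero_extend).
  // Any-extending to i64 turns the i63 constant -1 into INT64_MAX, so the
  // copied nsw now asserts that "1 + INT64_MAX" does not overflow i64.
  uint64_t x = 1;
  uint64_t c = UINT64_C(0x7FFFFFFFFFFFFFFF);  // i63 -1 seen as an i64 value
  uint64_t sum = c + x;                       // 0x8000000000000000
  // The signed i64 result flipped from positive to negative -> nsw violated.
  printf("0x%016llx\n", (unsigned long long)sum);
}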
The good news is that removing that flag copy doesn't cause catastrophic test fallout: https://gist.github.com/nikic/5f1d3b0c78f646f74c84aac68a40de76 So we probably just need to move that into some of the individual promotion methods where it is safe.
We just stumbled upon the same thing downstream and I also tracked it down to DAGTypeLegalizer::SetPromotedInteger.
I think it would be nice to at least stop copying the nsw/nuw flags in DAGTypeLegalizer::SetPromotedInteger as a first step (the original intent seems to have been to copy fast-math flags; no idea if even that is always safe, maybe not?).
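A minimal sketch of that first step (assuming Op and Result are the parameters of SetPromotedInteger; this is not the committed fix):

// Sketch: when recording the promoted node, drop the wrap flags that were
// only valid at the original width, keeping everything else (notably FMF).
SDNodeFlags Flags = Op->getFlags();
Flags.setNoSignedWrap(false);
Flags.setNoUnsignedWrap(false);
Result->setFlags(Flags);  // instead of copying Op->getFlags() wholesale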
And then we could try to add back nsw/nuw propagation at a more fine-grained, per-operation level. When dealing with an ADD operation, for example, we need to figure out whether it is better to promote the operands by sign-extension so that nsw can remain, or to promote without sext and drop the flag. Sometimes the promotion is by sign-extension anyway (I was looking at test/CodeGen/AArch64/arm64-vhadd.ll), and in that specific case I guess we want to keep the nsw flag, as in the example below.
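For example, a standalone C++ sketch of the i8-to-i32 case (values chosen for illustration):

#include <cstdint>
#include <cstdio>

int main() {
  // add nsw i8 %a, %b, promoted to i32.
  int8_t a = -1, b = 2;                        // -1 + 2 = 1: no i8 overflow
  // Sign-extended operands: both i32 values fit in [-128, 127], so the sum
  // fits in [-256, 254] and the i32 add can never wrap -> nsw still holds.
  int32_t sext = (int32_t)a + (int32_t)b;      // 1
  // Any-extended operands: the high 24 bits are arbitrary. If %a happens to
  // be extended to 0x7FFFFFFF (same low byte, 0xFF), the i32 add overflows,
  // so the nsw flag must be dropped.
  uint32_t any_a = 0x7FFFFFFFu;
  uint32_t wrapped = any_a + (uint32_t)b;      // 0x80000001: sign flip
  printf("%d 0x%08x\n", sext, wrapped);
}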
Has anyone already written a GitHub issue for following up on the DAGTypeLegalizer::SetPromotedInteger bug?
Thanks for the quick reaction! I've verified that the revert actually fixes the original problem we've seen.
Looks like that line in SetPromotedInteger was added to propagate FMF flags for SETCC. I'm happy to take ownership of fixing this unless someone else is already working on it?
I am interested in working on this, but if you already have a fix going please feel free to take over :)
I have a patch that removes the code from SetPromotedInteger, preserves FMF for setcc, and preserves nuw/nsw when promoting add/sub if the promoted value happens to be sign/zero extended according to computeKnownBits/ComputeNumSignBits. This avoids all of the test changes from removing the code in SetPromotedInteger.
I'll break it into two parts: removing the code from SetPromotedInteger and preserving FMF in one part, and nuw/nsw preservation in the second part.
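For reference, a rough sketch of what the per-operation check could look like (helper names and exact placement are assumptions, not the actual patch; assumes the usual SelectionDAG headers):

// nsw survives promotion iff both promoted operands are already
// sign-extended from the original width, i.e. the top
// (PromBits - OrigBits + 1) bits are all copies of the original sign bit.
bool canKeepNSW(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
                unsigned OrigBits, unsigned PromBits) {
  unsigned NeededSignBits = PromBits - OrigBits + 1;
  return DAG.ComputeNumSignBits(LHS) >= NeededSignBits &&
         DAG.ComputeNumSignBits(RHS) >= NeededSignBits;
}

// nuw survives iff both promoted operands are zero-extended: all bits
// above the original width must be known zero.
bool canKeepNUW(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
                unsigned OrigBits, unsigned PromBits) {
  APInt HighBits = APInt::getBitsSetFrom(PromBits, OrigBits);
  return DAG.MaskedValueIsZero(LHS, HighBits) &&
         DAG.MaskedValueIsZero(RHS, HighBits);
}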