If a shl node leaves the upper half bits zero / undemanded, then see if we can profitably perform this with a half-width shl and a free trunc/zext.
Followup to D146121
Paths
| Differential D155472
[DAG] Attempt shl narrowing in SimplifyDemandedBits ClosedPublic Authored by RKSimon on Jul 17 2023, 8:27 AM.
Details Summary If a shl node leaves the upper half bits zero / undemanded, then see if we can profitably perform this with a half-width shl and a free trunc/zext. Followup to D146121
Diff Detail
Event Timeline
Comment Actions @pengfei, @Peter told me about how these test cases can potentially be fuzzed to check for equivalence, and I've made a little script to compare generated X86 assemblies of simple function pairs (e.g. those without ptr parameters) by randomizing initial register values, emulating each function pair separately, and comparing final register values. Each function pair is emulated 1000 times using Unicorn Engine, although any difference in register value usually comes up in the first iteration. No function pair I've tested so far has presented a discrepancy in return value (%rax), though some do present a difference in caller-saved GPR contents (%rdi, %rcx, %rsi, %r8). The list of functions that I were able to verify is as follows: llvm/test/CodeGen/X86/bswap.ll
llvm/test/CodeGen/X86/cmp-concat.ll
llvm/test/CodeGen/X86/combine-bitreverse.ll
llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
llvm/test/CodeGen/X86/dagcombine-shifts.ll
llvm/test/CodeGen/X86/divmod128.ll
llvm/test/CodeGen/X86/extract-bits.ll
llvm/test/CodeGen/X86/lea-dagdag.ll
llvm/test/CodeGen/X86/lea-opt2.ll
llvm/test/CodeGen/X86/parity.ll
llvm/test/CodeGen/X86/select.ll
llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
llvm/test/CodeGen/X86/setcc.ll
llvm/test/CodeGen/X86/shift-combine.ll
llvm/test/CodeGen/X86/shift-pair.ll
llvm/test/CodeGen/X86/zext-shl.ll
Comment Actions @oakrc Nice! By caller-saved GPR contents do you mean intermediate register values? That should be expected as SimplifyDemandedBits can lead to undemanded bits from each register value changing. Comment Actions
Yeah, the discrepancy just shows that the script works, and it doesn't say there's anything wrong with the lowering code. RKSimon retitled this revision from [DAG] Attempt shl narrowing in SimplifyDemandedBits (WIP) to [DAG] Attempt shl narrowing in SimplifyDemandedBits. Comment ActionsReady for review Comment Actions FYI, after following patch, several files are not compilable with x86 clang because of out of memory after 1 hours infinity compilation. commit a8cef6b58e2d41f04ed4fa63c3f628eac1a28925 Author: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Mon Sep 11 10:17:28 2023 +0100 Check https://lab.llvm.org/buildbot/#/builders/91/builds/18672 for more details. Our buildbot has following ongoing compilations. $ ps axww|grep clang 14038 ? R 30:14 /scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/./bin/clang++ --target=x86_64-unknown-linux-gnu -DDEBUG_PREFIX="PluginInterface" -DGTEST_HAS_RTTI=0 -DLIBOMPTARGET_JIT_VE -DLIBOMPTARGET_JIT_X86 -DTARGET_NAME="PluginInterface" -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/llvm/include -I/scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/include -I/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/openmp/libomptarget/include -I/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/openmp/libomptarget/plugins-nextgen/common/elf_common -I/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/openmp/libomptarget/plugins-nextgen/common/MemoryManager -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Wall -Wcast-qual -Wformat-pedantic -Wimplicit-fallthrough -Wsign-compare -Wno-enum-constexpr-conversion -Wno-extra -Wno-pedantic -O2 -g -DNDEBUG -fPIC -fvisibility=protected -fno-exceptions -funwind-tables -fno-rtti -std=c++17 -MD -MT openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeFiles/PluginInterface.dir/PluginInterface.cpp.o -MF openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeFiles/PluginInterface.dir/PluginInterface.cpp.o.d -o openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeFiles/PluginInterface.dir/PluginInterface.cpp.o -c /scratch/buildbot/bothome/clang-ve-ninja/llvm-project/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp 14177 ? R 30:12 /scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/./bin/clang++ --target=x86_64-unknown-linux-gnu -DGTEST_HAS_RTTI=0 -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/llvm/include -I/scratch/buildbot/bothome clang-ve-ninja/build/build_llvm/include -I/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/openmp/libomptarget/include -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Wall -Wcast-qual -Wformat-pedantic -Wimplicit-fallthrough -Wsign-compare -Wno-enum-constexpr-conversion -Wno-extra -Wno-pedantic -O2 -g -DNDEBUG -fPIC -fno-exceptions -funwind-tables -fno-rtti -std=c++17 -MD -MT openmp/libomptarget/src/CMakeFiles/omptarget.dir/device.cpp.o -MF openmp/libomptarget/src/CMakeFiles/omptarget.dir/device.cpp.o.d -o openmp/libomptarget/src/CMakeFiles/omptarget.dir/device.cpp.o -c /scratch/buildbot/bothome/clang-ve-ninja/llvm-project/openmp/libomptarget/src/device.cpp 14186 ? R 26:02 /scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/./bin/clang++ --target=x86_64-unknown-linux-gnu -DGTEST_HAS_RTTI=0 -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/llvm/include -I/scratch/buildbot/bothome clang-ve-ninja/build/build_llvm/include -I/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/openmp/libomptarget/include -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Wall -Wcast-qual -Wformat-pedantic -Wimplicit-fallthrough -Wsign-compare -Wno-enum-constexpr-conversion -Wno-extra -Wno-pedantic -O2 -g -DNDEBUG -fPIC -fno-exceptions -funwind-tables -fno-rtti -std=c++17 -MD -MT openmp/libomptarget/src/CMakeFiles/omptarget.dir/rtl.cpp.o -MF openmp/libomptarget/src/CMakeFiles/omptarget.dir/rtl.cpp.o.d -o openmp/libomptarget/src/CMakeFiles/omptarget.dir/rtl.cpp.o -c /scratch/buildbot/bothome/clang-ve-ninja/llvm-project/openmp/libomptarget/src/rtl.cpp Comment Actions
Thanks. And thank you for your efforts.
Comment Actions Still a few more regressions to address - I'll be back :)
Comment Actions rebase - the only remaining codegen diff I'm not happy with is the or-address.ll change - we have different oneuse handling for SHL and ZEXT(SHL) cases which is going to take a bit longer to cleanup Comment Actions Can the changes to X86ISelDAGToDAG.cpp and X86ISelLowering.cpp that are there to handle the regressions be split to a prior patch?
Comment Actions address feedback and rebase - I'm still investigating how to handle more of the X86-specific changes in pre-commits Comment Actions
Do they cause regressions if committed independently? Comment Actions The actual shift truncation code looks good. Would prefer if the other changes where an independent commit, but if thats not feasible that shouldn't be a blocker. Comment Actions
They currently cause no test changes against trunk - I need to create a few tests that I'm happy with and that should cover most of the X86 changes so I will be able to pre-commit those. The DAGCombiner.cpp change might have to stay though. This revision is now accepted and ready to land.Oct 3 2023, 12:12 PM Closed by commit rG7a8c04ef84ec: [DAG] Attempt shl narrowing in SimplifyDemandedBits (authored by RKSimon). · Explain WhyOct 4 2023, 2:23 AM This revision was automatically updated to reflect the committed changes.
Comment Actions this is causing check-clang failures in a x86-64 stage 2 build, e.g. clang/test/Parser/cxx2b-lambdas.cpp. it showed up on a bot and I managed to repro locally, so it's likely reproducible Comment Actions a lot of the crashes seem to be on this line, perhaps that code has UB? could we revert this while investigating? kstoimenov added a reverting change: rG0a776996af69: Revert "[DAG] Attempt shl narrowing in SimplifyDemandedBits".Oct 4 2023, 3:16 PM Comment Actions there is a case that has peformance regression before this patch, it can easily find t116 as index: >t56: f64,ch = VMOVSDZrm_alt<Mem:(load (s64) from %ir.34, !tbaa !37)> t16, TargetConstant:i8<8>, t116, TargetConstant:i32<16>, Register:i16 $noreg, t0 after this patch, ISEL can only find t127 as index instead of t125 since ISEL is not sure the high 32bits of t125 is zero. Thus, extra instruction are created. t95: i64,ch = load<(load (s16) from %ir.5, !tbaa !33), zext from i16> t0, t6, undef:i64 >t56: f64,ch = VMOVSDZrm_alt<Mem:(load (s64) from %ir.34, !tbaa !37)> t16, TargetConstant:i8<8>, t140, TargetConstant:i32<16>, Register:i16 $noreg, t0 i guess writing td's pattern is not quite flexible and it cannot use SImplifyDemandedBits. @RKSimon do you have any idea to handle such case? Comment Actions @yubing Try stepping through X86DAGToDAGISel::matchAddressRecursively - its likely we're missing a case now that a zext has been inserted someplace.
Revision Contents
Diff 551102 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
llvm/test/CodeGen/AMDGPU/idiv-licm.ll
llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
llvm/test/CodeGen/X86/avx512vnni-combine.ll
llvm/test/CodeGen/X86/avxvnni-combine.ll
llvm/test/CodeGen/X86/bswap.ll
llvm/test/CodeGen/X86/buildvec-insertvec.ll
llvm/test/CodeGen/X86/cmp-concat.ll
llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
llvm/test/CodeGen/X86/combine-bitreverse.ll
llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
llvm/test/CodeGen/X86/dagcombine-shifts.ll
llvm/test/CodeGen/X86/divmod128.ll
llvm/test/CodeGen/X86/extract-bits.ll
llvm/test/CodeGen/X86/fp128-i128.ll
llvm/test/CodeGen/X86/lea-dagdag.ll
llvm/test/CodeGen/X86/lea-opt2.ll
llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
llvm/test/CodeGen/X86/masked_compressstore.ll
llvm/test/CodeGen/X86/masked_expandload.ll
llvm/test/CodeGen/X86/or-address.ll
llvm/test/CodeGen/X86/parity.ll
llvm/test/CodeGen/X86/pr22970.ll
llvm/test/CodeGen/X86/pr38217.ll
llvm/test/CodeGen/X86/pr62653.ll
llvm/test/CodeGen/X86/select.ll
llvm/test/CodeGen/X86/select_const.ll
llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
llvm/test/CodeGen/X86/setcc.ll
llvm/test/CodeGen/X86/shift-combine.ll
llvm/test/CodeGen/X86/shift-pair.ll
llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
llvm/test/CodeGen/X86/vselect.ll
llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
llvm/test/CodeGen/X86/zext-shl.ll
|
I'd argue the MakeValueIsZero check should be after all the other checks (including the profitability ones) as the recursion can be expensive. Although its probably not a huge deal either way.