diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7716,6 +7716,18 @@ if (Len <= 8) return Op; + // Avoid the multiply if we only have 2 bytes to add. + // TODO: Only doing this for scalars because vectors weren't as obviously + // improved. + if (Len == 16 && !VT.isVector()) { + // v = (v + (v >> 8)) & 0x00FF; + return DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::ADD, dl, VT, Op, + DAG.getNode(ISD::SRL, dl, VT, Op, + DAG.getConstant(8, dl, ShVT))), + DAG.getConstant(0xFF, dl, VT)); + } + // v = (v * 0x01010101...) >> (Len - 8) SDValue Mask01 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT); diff --git a/llvm/test/CodeGen/PowerPC/popcnt-zext.ll b/llvm/test/CodeGen/PowerPC/popcnt-zext.ll --- a/llvm/test/CodeGen/PowerPC/popcnt-zext.ll +++ b/llvm/test/CodeGen/PowerPC/popcnt-zext.ll @@ -23,9 +23,9 @@ ; SLOW-NEXT: add 3, 4, 3 ; SLOW-NEXT: srwi 4, 3, 4 ; SLOW-NEXT: add 3, 3, 4 -; SLOW-NEXT: andi. 3, 3, 3855 -; SLOW-NEXT: mulli 3, 3, 257 -; SLOW-NEXT: rlwinm 3, 3, 24, 24, 31 +; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31 +; SLOW-NEXT: clrlwi 3, 3, 28 +; SLOW-NEXT: add 3, 3, 4 ; SLOW-NEXT: blr %z = zext i8 %x to i16 %pop = tail call i16 @llvm.ctpop.i16(i16 %z) @@ -172,9 +172,10 @@ ; SLOW-NEXT: add 3, 4, 3 ; SLOW-NEXT: srwi 4, 3, 4 ; SLOW-NEXT: add 3, 3, 4 -; SLOW-NEXT: andi. 3, 3, 3855 -; SLOW-NEXT: mulli 3, 3, 257 -; SLOW-NEXT: rlwinm 3, 3, 24, 24, 31 +; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31 +; SLOW-NEXT: clrlwi 3, 3, 28 +; SLOW-NEXT: add 3, 3, 4 +; SLOW-NEXT: clrldi 3, 3, 32 ; SLOW-NEXT: blr %pop = tail call i16 @llvm.ctpop.i16(i16 %x) %z = zext i16 %pop to i32 @@ -276,9 +277,9 @@ ; SLOW-NEXT: add 3, 4, 3 ; SLOW-NEXT: srwi 4, 3, 4 ; SLOW-NEXT: add 3, 3, 4 -; SLOW-NEXT: andi. 3, 3, 3855 -; SLOW-NEXT: mulli 3, 3, 257 -; SLOW-NEXT: srwi 3, 3, 8 +; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31 +; SLOW-NEXT: clrlwi 3, 3, 28 +; SLOW-NEXT: add 3, 3, 4 ; SLOW-NEXT: rlwinm 3, 3, 0, 27, 27 ; SLOW-NEXT: blr %pop = call i16 @llvm.ctpop.i16(i16 %x) diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -110,13 +110,10 @@ ; RV32_NOZBB-NEXT: add a0, a2, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 -; RV32_NOZBB-NEXT: lui a1, 1 -; RV32_NOZBB-NEXT: addi a1, a1, -241 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: slli a1, a0, 8 +; RV32_NOZBB-NEXT: andi a1, a0, 15 +; RV32_NOZBB-NEXT: slli a0, a0, 20 +; RV32_NOZBB-NEXT: srli a0, a0, 28 ; RV32_NOZBB-NEXT: add a0, a1, a0 -; RV32_NOZBB-NEXT: slli a0, a0, 19 -; RV32_NOZBB-NEXT: srli a0, a0, 27 ; RV32_NOZBB-NEXT: ret ; RV32_NOZBB-NEXT: .LBB1_2: ; RV32_NOZBB-NEXT: li a0, 16 @@ -143,14 +140,11 @@ ; RV64NOZBB-NEXT: and a0, a0, a1 ; RV64NOZBB-NEXT: add a0, a2, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 1 -; RV64NOZBB-NEXT: addiw a1, a1, -241 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: slliw a1, a0, 8 -; RV64NOZBB-NEXT: addw a0, a1, a0 -; RV64NOZBB-NEXT: slli a0, a0, 51 -; RV64NOZBB-NEXT: srli a0, a0, 59 +; RV64NOZBB-NEXT: addw a0, a0, a1 +; RV64NOZBB-NEXT: andi a1, a0, 15 +; RV64NOZBB-NEXT: slli a0, a0, 52 +; RV64NOZBB-NEXT: srli a0, a0, 60 +; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: ret ; RV64NOZBB-NEXT: .LBB1_2: ; RV64NOZBB-NEXT: li a0, 16 @@ -606,13 +600,10 @@ ; RV32_NOZBB-NEXT: add a0, a2, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 -; RV32_NOZBB-NEXT: lui a1, 1 -; RV32_NOZBB-NEXT: addi a1, a1, -241 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: slli a1, a0, 8 +; RV32_NOZBB-NEXT: andi a1, a0, 15 +; RV32_NOZBB-NEXT: slli a0, a0, 20 +; RV32_NOZBB-NEXT: srli a0, a0, 28 ; RV32_NOZBB-NEXT: add a0, a1, a0 -; RV32_NOZBB-NEXT: slli a0, a0, 19 -; RV32_NOZBB-NEXT: srli a0, a0, 27 ; RV32_NOZBB-NEXT: ret ; ; RV64NOZBB-LABEL: test_cttz_i16_zero_undef: @@ -632,14 +623,11 @@ ; RV64NOZBB-NEXT: and a0, a0, a1 ; RV64NOZBB-NEXT: add a0, a2, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 1 -; RV64NOZBB-NEXT: addiw a1, a1, -241 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: slliw a1, a0, 8 -; RV64NOZBB-NEXT: addw a0, a1, a0 -; RV64NOZBB-NEXT: slli a0, a0, 51 -; RV64NOZBB-NEXT: srli a0, a0, 59 +; RV64NOZBB-NEXT: addw a0, a0, a1 +; RV64NOZBB-NEXT: andi a1, a0, 15 +; RV64NOZBB-NEXT: slli a0, a0, 52 +; RV64NOZBB-NEXT: srli a0, a0, 60 +; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: ret ; ; RV32ZBB-LABEL: test_cttz_i16_zero_undef: @@ -1096,13 +1084,10 @@ ; RV32_NOZBB-NEXT: add a0, a2, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 -; RV32_NOZBB-NEXT: lui a1, 1 -; RV32_NOZBB-NEXT: addi a1, a1, -241 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: slli a1, a0, 8 +; RV32_NOZBB-NEXT: andi a1, a0, 15 +; RV32_NOZBB-NEXT: slli a0, a0, 20 +; RV32_NOZBB-NEXT: srli a0, a0, 28 ; RV32_NOZBB-NEXT: add a0, a1, a0 -; RV32_NOZBB-NEXT: slli a0, a0, 19 -; RV32_NOZBB-NEXT: srli a0, a0, 27 ; RV32_NOZBB-NEXT: ret ; RV32_NOZBB-NEXT: .LBB9_2: ; RV32_NOZBB-NEXT: li a0, 16 @@ -1138,14 +1123,11 @@ ; RV64NOZBB-NEXT: and a0, a0, a1 ; RV64NOZBB-NEXT: add a0, a2, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 1 -; RV64NOZBB-NEXT: addiw a1, a1, -241 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: slliw a1, a0, 8 -; RV64NOZBB-NEXT: addw a0, a1, a0 -; RV64NOZBB-NEXT: slli a0, a0, 51 -; RV64NOZBB-NEXT: srli a0, a0, 59 +; RV64NOZBB-NEXT: addw a0, a0, a1 +; RV64NOZBB-NEXT: andi a1, a0, 15 +; RV64NOZBB-NEXT: slli a0, a0, 52 +; RV64NOZBB-NEXT: srli a0, a0, 60 +; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: ret ; RV64NOZBB-NEXT: .LBB9_2: ; RV64NOZBB-NEXT: li a0, 16 @@ -1713,13 +1695,10 @@ ; RV32_NOZBB-NEXT: add a0, a2, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 -; RV32_NOZBB-NEXT: lui a1, 1 -; RV32_NOZBB-NEXT: addi a1, a1, -241 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: slli a1, a0, 8 +; RV32_NOZBB-NEXT: andi a1, a0, 15 +; RV32_NOZBB-NEXT: slli a0, a0, 20 +; RV32_NOZBB-NEXT: srli a0, a0, 28 ; RV32_NOZBB-NEXT: add a0, a1, a0 -; RV32_NOZBB-NEXT: slli a0, a0, 19 -; RV32_NOZBB-NEXT: srli a0, a0, 27 ; RV32_NOZBB-NEXT: ret ; ; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef: @@ -1749,14 +1728,11 @@ ; RV64NOZBB-NEXT: and a0, a0, a1 ; RV64NOZBB-NEXT: add a0, a2, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 1 -; RV64NOZBB-NEXT: addiw a1, a1, -241 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: slliw a1, a0, 8 -; RV64NOZBB-NEXT: addw a0, a1, a0 -; RV64NOZBB-NEXT: slli a0, a0, 51 -; RV64NOZBB-NEXT: srli a0, a0, 59 +; RV64NOZBB-NEXT: addw a0, a0, a1 +; RV64NOZBB-NEXT: andi a1, a0, 15 +; RV64NOZBB-NEXT: slli a0, a0, 52 +; RV64NOZBB-NEXT: srli a0, a0, 60 +; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: ret ; ; RV32ZBB-LABEL: test_ctlz_i16_zero_undef: @@ -2251,13 +2227,10 @@ ; RV32_NOZBB-NEXT: add a0, a2, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 -; RV32_NOZBB-NEXT: lui a1, 1 -; RV32_NOZBB-NEXT: addi a1, a1, -241 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: slli a1, a0, 8 +; RV32_NOZBB-NEXT: andi a1, a0, 15 +; RV32_NOZBB-NEXT: slli a0, a0, 20 +; RV32_NOZBB-NEXT: srli a0, a0, 28 ; RV32_NOZBB-NEXT: add a0, a1, a0 -; RV32_NOZBB-NEXT: slli a0, a0, 19 -; RV32_NOZBB-NEXT: srli a0, a0, 27 ; RV32_NOZBB-NEXT: ret ; ; RV64NOZBB-LABEL: test_ctpop_i16: @@ -2274,14 +2247,11 @@ ; RV64NOZBB-NEXT: and a0, a0, a1 ; RV64NOZBB-NEXT: add a0, a2, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 1 -; RV64NOZBB-NEXT: addiw a1, a1, -241 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: slliw a1, a0, 8 -; RV64NOZBB-NEXT: addw a0, a1, a0 -; RV64NOZBB-NEXT: slli a0, a0, 51 -; RV64NOZBB-NEXT: srli a0, a0, 59 +; RV64NOZBB-NEXT: addw a0, a0, a1 +; RV64NOZBB-NEXT: andi a1, a0, 15 +; RV64NOZBB-NEXT: slli a0, a0, 52 +; RV64NOZBB-NEXT: srli a0, a0, 60 +; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: ret ; ; RV32ZBB-LABEL: test_ctpop_i16: diff --git a/llvm/test/CodeGen/X86/parity-vec.ll b/llvm/test/CodeGen/X86/parity-vec.ll --- a/llvm/test/CodeGen/X86/parity-vec.ll +++ b/llvm/test/CodeGen/X86/parity-vec.ll @@ -64,9 +64,8 @@ ; NOPOPCNT-NEXT: addl %eax, %ecx ; NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F ; NOPOPCNT-NEXT: movl %ecx, %eax -; NOPOPCNT-NEXT: shll $8, %eax -; NOPOPCNT-NEXT: addl %ecx, %eax ; NOPOPCNT-NEXT: shrl $8, %eax +; NOPOPCNT-NEXT: addl %ecx, %eax ; NOPOPCNT-NEXT: # kill: def $al killed $al killed $eax ; NOPOPCNT-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -77,9 +77,9 @@ ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: andl $3855, %ecx # imm = 0xF0F ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shll $8, %eax +; X86-NEXT: shrl $8, %eax ; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl %ah, %eax +; X86-NEXT: movzbl %al, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -99,9 +99,9 @@ ; X64-NEXT: addl %edi, %eax ; X64-NEXT: andl $3855, %eax # imm = 0xF0F ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shll $8, %ecx +; X64-NEXT: shrl $8, %ecx ; X64-NEXT: addl %eax, %ecx -; X64-NEXT: movzbl %ch, %eax +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; @@ -1540,9 +1540,9 @@ ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: andl $3855, %ecx # imm = 0xF0F ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shll $8, %eax +; X86-NEXT: shrl $8, %eax ; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl %ah, %eax +; X86-NEXT: movzbl %al, %eax ; X86-NEXT: retl ; ; X64-LABEL: popcount_i16_zext: @@ -1561,9 +1561,9 @@ ; X64-NEXT: addl %edi, %eax ; X64-NEXT: andl $3855, %eax # imm = 0xF0F ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shll $8, %ecx +; X64-NEXT: shrl $8, %ecx ; X64-NEXT: addl %eax, %ecx -; X64-NEXT: movzbl %ch, %eax +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: retq ; ; X86-POPCNT-LABEL: popcount_i16_zext: