diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -1850,16 +1850,79 @@
 // registers in the Regs array.
 // Because AVR does not have a normal shift instruction (only a single bit shift
 // instruction), we have to emulate this behavior with other instructions.
+// It first tries large steps (moving registers around) and then smaller steps
+// like single bit shifts.
+// Large shifts actually reduce the number of shifted registers, so the below
+// algorithms have to work independently of the number of registers that are
+// shifted.
+// For more information and background, see this blog post:
+// https://aykevl.nl/2021/02/avr-bitshift
 static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
                                  MutableArrayRef<std::pair<Register, int>> Regs,
                                  ISD::NodeType Opc, int64_t ShiftAmt) {
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+  const AVRSubtarget &STI = BB->getParent()->getSubtarget<AVRSubtarget>();
   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
   const DebugLoc &dl = MI.getDebugLoc();
 
   const bool ShiftLeft = Opc == ISD::SHL;
   const bool ArithmeticShift = Opc == ISD::SRA;
 
+  // Zero a register, for use in later operations.
+  Register ZeroReg = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+  BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ZeroReg)
+      .addReg(STI.getZeroRegister());
+
+  // For shift amounts of at least one register, simply rename the registers and
+  // zero the bottom registers.
+  while (ShiftLeft && ShiftAmt >= 8) {
+    // Move all registers one to the left.
+    for (size_t I = 0; I < Regs.size() - 1; I++) {
+      Regs[I] = Regs[I + 1];
+    }
+
+    // Zero the least significant register.
+    Regs[Regs.size() - 1] = std::pair(ZeroReg, 0);
+
+    // Continue shifts with the leftover registers.
+    Regs = Regs.drop_back(1);
+
+    ShiftAmt -= 8;
+  }
+
+  // And again, the same for right shifts.
+  Register ShrExtendReg = 0;
+  if (!ShiftLeft && ShiftAmt >= 8) {
+    if (ArithmeticShift) {
+      // Sign extend the most significant register into ShrExtendReg.
+      ShrExtendReg = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      Register Tmp = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Tmp)
+          .addReg(Regs[0].first, 0, Regs[0].second)
+          .addReg(Regs[0].first, 0, Regs[0].second);
+      BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), ShrExtendReg)
+          .addReg(Tmp)
+          .addReg(Tmp);
+    } else {
+      ShrExtendReg = ZeroReg;
+    }
+    for (; ShiftAmt >= 8; ShiftAmt -= 8) {
+      // Move all registers one to the right.
+      for (size_t I = Regs.size() - 1; I != 0; I--) {
+        Regs[I] = Regs[I - 1];
+      }
+
+      // Zero or sign extend the most significant register.
+      Regs[0] = std::pair(ShrExtendReg, 0);
+
+      // Continue shifts with the leftover registers.
+      Regs = Regs.drop_front(1);
+    }
+  }
+
+  // The bigger shifts are already handled above.
+  assert((ShiftAmt < 8) && "Unexpected shift amount");
+
   // Shift by one. This is the fallback that always works, and the shift
   // operation that is used for 1, 2, and 3 bit shifts.
   while (ShiftLeft && ShiftAmt) {
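The code above decomposes a constant multi-byte shift into whole-register renames followed by at most seven single-bit shifts. To make that decomposition concrete, here is a minimal standalone C++ sketch of the same idea. It is not part of the patch: the helper name emulateShift and the little-endian byte-vector model of the registers are illustrative assumptions only. On real AVR, the whole-byte steps become register renames (mov/movw, or nothing after register allocation) and the bit steps become lsl/rol, lsr/ror, or asr/ror chains, as shown in the tests below.

// Standalone sketch (not part of the patch): emulate the shift decomposition
// on a little-endian array of bytes.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

static void emulateShift(std::vector<uint8_t> &Bytes, int64_t ShiftAmt,
                         bool ShiftLeft, bool Arithmetic) {
  // Extension byte used when right-shifting by whole bytes: 0x00, or 0xFF for
  // an arithmetic shift of a negative value. The patch materializes the 0xFF
  // case with an "lsl; sbc" pair (see the note after the tests below).
  uint8_t Ext = 0;
  if (!ShiftLeft && Arithmetic && (Bytes.back() & 0x80))
    Ext = 0xFF;

  // Whole-byte steps: pure renames, no ALU shift needed.
  for (; ShiftAmt >= 8; ShiftAmt -= 8) {
    if (ShiftLeft) {
      Bytes.insert(Bytes.begin(), 0); // zero the least significant byte
      Bytes.pop_back();               // drop the byte shifted out
    } else {
      Bytes.erase(Bytes.begin());     // drop the byte shifted out
      Bytes.push_back(Ext);           // zero or sign extend the top byte
    }
  }
  assert(ShiftAmt < 8 && "whole-byte steps already handled");

  // Remaining 1-7 bits: single-bit shifts, propagated through a carry.
  while (ShiftAmt-- > 0) {
    if (ShiftLeft) {
      uint8_t Carry = 0;
      for (uint8_t &B : Bytes) { // lsl + rol chain, LSB first
        uint8_t NewCarry = B >> 7;
        B = uint8_t((B << 1) | Carry);
        Carry = NewCarry;
      }
    } else {
      uint8_t Carry = Arithmetic ? (Bytes.back() >> 7) : 0;
      for (size_t I = Bytes.size(); I-- > 0;) { // asr/lsr + ror chain
        uint8_t NewCarry = Bytes[I] & 1;
        Bytes[I] = uint8_t((Bytes[I] >> 1) | (Carry << 7));
        Carry = NewCarry;
      }
    }
  }
}

int main() {
  // 0x12345678 << 9, matching the shl_i32_9 test below.
  std::vector<uint8_t> V = {0x78, 0x56, 0x34, 0x12}; // little-endian
  emulateShift(V, 9, /*ShiftLeft=*/true, /*Arithmetic=*/false);
  std::printf("%02x%02x%02x%02x\n", V[3], V[2], V[1], V[0]); // prints 68acf000
  return 0;
}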
diff --git a/llvm/test/CodeGen/AVR/shift32.ll b/llvm/test/CodeGen/AVR/shift32.ll
--- a/llvm/test/CodeGen/AVR/shift32.ll
+++ b/llvm/test/CodeGen/AVR/shift32.ll
@@ -29,6 +29,33 @@
   ret i32 %res
 }
 
+define i32 @shl_i32_8(i32 %a) {
+; CHECK-LABEL: shl_i32_8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    mov r23, r22
+; CHECK-NEXT:    mov r22, r1
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 8
+  ret i32 %res
+}
+
+define i32 @shl_i32_9(i32 %a) {
+; CHECK-LABEL: shl_i32_9:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    lsl r22
+; CHECK-NEXT:    rol r23
+; CHECK-NEXT:    rol r24
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    mov r23, r22
+; CHECK-NEXT:    mov r22, r1
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 9
+  ret i32 %res
+}
+
 ; This is a special case: this shift is performed directly inside SelectionDAG
 ; instead of as a custom lowering like the other shift operations.
 define i32 @shl_i32_16(i32 %a) {
@@ -89,6 +116,37 @@
   ret i32 %res
 }
 
+define i32 @lshr_i32_8(i32 %a) {
+; CHECK-LABEL: lshr_i32_8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov r19, r1
+; CHECK-NEXT:    mov r18, r25
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    movw r24, r18
+; CHECK-NEXT:    ret
+  %res = lshr i32 %a, 8
+  ret i32 %res
+}
+
+define i32 @lshr_i32_9(i32 %a) {
+; CHECK-LABEL: lshr_i32_9:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    lsr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    mov r19, r1
+; CHECK-NEXT:    mov r18, r25
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    movw r24, r18
+; CHECK-NEXT:    ret
+  %res = lshr i32 %a, 9
+  ret i32 %res
+}
+
 define i32 @lshr_i32_16(i32 %a) {
 ; CHECK-LABEL: lshr_i32_16:
 ; CHECK:       ; %bb.0:
@@ -100,6 +158,19 @@
   ret i32 %res
 }
 
+define i32 @lshr_i32_24(i32 %a) {
+; CHECK-LABEL: lshr_i32_24:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov r19, r1
+; CHECK-NEXT:    mov r18, r1
+; CHECK-NEXT:    mov r23, r1
+; CHECK-NEXT:    mov r22, r25
+; CHECK-NEXT:    movw r24, r18
+; CHECK-NEXT:    ret
+  %res = lshr i32 %a, 24
+  ret i32 %res
+}
+
 define i32 @ashr_i32_1(i32 %a) {
 ; CHECK-LABEL: ashr_i32_1:
 ; CHECK:       ; %bb.0:
@@ -127,3 +198,46 @@
   %res = ashr i32 %a, 2
   ret i32 %res
 }
+
+; TODO: this could be optimized to 4 movs, instead of 6.
+define i32 @ashr_i32_8(i32 %a) {
+; CHECK-LABEL: ashr_i32_8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov r19, r25
+; CHECK-NEXT:    lsl r19
+; CHECK-NEXT:    sbc r19, r19
+; CHECK-NEXT:    mov r18, r25
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    movw r24, r18
+; CHECK-NEXT:    ret
+  %res = ashr i32 %a, 8
+  ret i32 %res
+}
+
+define i32 @ashr_i32_16(i32 %a) {
+; CHECK-LABEL: ashr_i32_16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    lsl r25
+; CHECK-NEXT:    sbc r25, r25
+; CHECK-NEXT:    mov r24, r25
+; CHECK-NEXT:    ret
+  %res = ashr i32 %a, 16
+  ret i32 %res
+}
+
+define i32 @ashr_i32_17(i32 %a) {
+; CHECK-LABEL: ashr_i32_17:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    lsl r25
+; CHECK-NEXT:    sbc r25, r25
+; CHECK-NEXT:    asr r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    mov r24, r25
+; CHECK-NEXT:    ret
+  %res = ashr i32 %a, 17
+  ret i32 %res
+}
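The sign byte in the ashr tests above is produced by the two-instruction idiom lsl Rd; sbc Rd, Rd, which the patch emits as ADDRdRr followed by SBCRdRr (on AVR, lsl Rd is an alias of add Rd, Rd): the lsl moves the sign bit into the carry flag, and subtracting a register from itself with carry then leaves 0x00 or 0xFF. A minimal standalone sketch of why this works, with signByte as a hypothetical helper name (not part of the patch):

#include <cstdint>
#include <cstdio>

// Compute what the AVR sequence "lsl Rd; sbc Rd, Rd" leaves in Rd.
static uint8_t signByte(uint8_t Rd) {
  uint8_t Carry = Rd >> 7;       // lsl Rd: carry flag receives old bit 7
  Rd = uint8_t(Rd << 1);         // the shifted value itself is then discarded:
  Rd = uint8_t(Rd - Rd - Carry); // sbc Rd, Rd: Rd - Rd - C = -C = 0x00 or 0xFF
  return Rd;
}

int main() {
  std::printf("%02x %02x\n", signByte(0x7F), signByte(0x80)); // prints: 00 ff
  return 0;
}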