Index: llvm/lib/Target/AVR/AVRISelLowering.cpp
===================================================================
--- llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -1833,13 +1833,75 @@
 // registers in the Regs array.
 // Because AVR does not have a normal shift instruction (only a single bit shift
 // instruction), we have to emulate this behavior with other instructions.
+// It first tries large steps (moving registers around) and then smaller steps
+// like single bit shifts.
+// Large shifts actually reduce the number of shifted registers, so the below
+// algorithms have to work independently of the number of registers that are
+// shifted.
+// For more information and background, see this blogpost:
+// https://aykevl.nl/2021/02/avr-bitshift
 static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
                                  MutableArrayRef<std::pair<Register, int>> Regs,
                                  int64_t ShiftAmt, bool ArithmeticShift) {
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+  const AVRSubtarget &STI = BB->getParent()->getSubtarget<AVRSubtarget>();
   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
   DebugLoc dl = MI.getDebugLoc();
 
+  // For shift amounts of at least one register, simply rename the registers and
+  // zero the bottom registers.
+  auto MSBReg = Regs[0];
+  Register ShrExtendReg = 0;
+  while (ShiftAmt <= -8) {
+    // Move all registers one to the left.
+    for (size_t I = 0; I < Regs.size() - 1; I++) {
+      Regs[I] = Regs[I + 1];
+    }
+
+    // Zero the least significant register.
+    Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+    BuildMI(*BB, MI, dl, TII.get(AVR::COPY), Out).addReg(STI.getZeroRegister());
+    Regs[Regs.size() - 1] = std::pair(Out, 0);
+
+    // Continue shifts with the leftover registers.
+    Regs = Regs.drop_back(1);
+
+    ShiftAmt += 8;
+  }
+  while (ShiftAmt >= 8) {
+    // Move all registers one to the right.
+    for (size_t I = Regs.size() - 1; I != 0; I--) {
+      Regs[I] = Regs[I - 1];
+    }
+
+    // Zero or sign extend the most significant register.
+    if (ShrExtendReg == 0) {
+      ShrExtendReg = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      if (ArithmeticShift) {
+        // Sign extend the most significant register into ShrExtendReg.
+        Register Tmp = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+        BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Tmp)
+            .addReg(MSBReg.first, 0, MSBReg.second)
+            .addReg(MSBReg.first, 0, MSBReg.second);
+        BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), ShrExtendReg)
+            .addReg(Tmp)
+            .addReg(Tmp);
+      } else {
+        BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ShrExtendReg)
+            .addReg(STI.getZeroRegister());
+      }
+    }
+    Regs[0] = std::pair(ShrExtendReg, 0);
+
+    // Continue shifts with the leftover registers.
+    Regs = Regs.drop_front(1);
+
+    ShiftAmt -= 8;
+  }
+
+  // The bigger shifts are already handled above.
+  assert((ShiftAmt >= -8 && ShiftAmt <= 8) && "Unexpected shift amount");
+
   // Shift by one. This is the fallback that always works, and the shift
   // operation that is used for 1, 2, and 3 bit shifts.
   while (ShiftAmt < 0) {
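The new fast path above is easiest to see on plain bytes. What follows is a minimal host-side model of the same decomposition for a logical right shift, not LLVM code; the function name lshrBytes and the little-endian byte layout are assumptions made for this sketch. Whole-byte steps are pure renames, and only the leftover 0-7 bits reach the single-bit LSR/ROR fallback.

#include <cassert>
#include <cstdint>
#include <cstring>

// Model of insertMultibyteShift's decomposition for a logical right shift
// over N little-endian bytes (illustrative only, not the LLVM implementation).
static void lshrBytes(uint8_t *Bytes, size_t N, unsigned ShiftAmt) {
  // Whole-byte steps: move every byte one slot down and zero the top byte.
  // This mirrors the register-renaming loop in the patch.
  while (ShiftAmt >= 8) {
    std::memmove(Bytes, Bytes + 1, N - 1);
    Bytes[N - 1] = 0;
    ShiftAmt -= 8;
  }
  // Leftover 0-7 bits: one pass per bit, like an LSR on the most significant
  // byte followed by RORs down to the least significant byte.
  for (; ShiftAmt > 0; ShiftAmt--) {
    unsigned Carry = 0;
    for (size_t I = N; I-- > 0;) {
      unsigned NewCarry = Bytes[I] & 1;
      Bytes[I] = (uint8_t)((Bytes[I] >> 1) | (Carry << 7));
      Carry = NewCarry;
    }
  }
}

int main() {
  uint8_t V[4] = {0x78, 0x56, 0x34, 0x12}; // 0x12345678, little-endian
  lshrBytes(V, 4, 17);                     // two byte renames plus one bit
  // 0x12345678 >> 17 == 0x0000091a.
  assert(V[0] == 0x1a && V[1] == 0x09 && V[2] == 0 && V[3] == 0);
  return 0;
}

The left-shift direction is symmetric: bytes are renamed toward the most significant end and the bottom byte is zeroed, exactly as the ShiftAmt <= -8 loop does with virtual registers.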
Index: llvm/test/CodeGen/AVR/avr-rust-issue-123.ll
===================================================================
--- llvm/test/CodeGen/AVR/avr-rust-issue-123.ll
+++ llvm/test/CodeGen/AVR/avr-rust-issue-123.ll
@@ -48,7 +48,7 @@
   %tmp5 = mul nuw nsw i32 %tmp4, 100
   ; CHECK: sts delay+1, r{{[0-9]+}}
   ; CHECK-NEXT: sts delay, r{{[0-9]+}}
-  ; CHECK: sts delay+3, r{{[0-9]+}}
+  ; CHECK-NEXT: sts delay+3, r{{[0-9]+}}
   ; CHECK-NEXT: sts delay+2, r{{[0-9]+}}
   store i32 %tmp5, i32* getelementptr inbounds (%UInt32, %UInt32* @delay, i64 0, i32 0), align 4
   tail call void @eeprom_write(i16 34, i8 %tmp3)
Index: llvm/test/CodeGen/AVR/div.ll
===================================================================
--- llvm/test/CodeGen/AVR/div.ll
+++ llvm/test/CodeGen/AVR/div.ll
@@ -44,7 +44,9 @@
 define i32 @udiv32(i32 %a, i32 %b) {
 ; CHECK-LABEL: udiv32:
 ; CHECK: call __udivmodsi4
-; CHECK: ret
+; CHECK-NEXT: movw r22, r18
+; CHECK-NEXT: movw r24, r20
+; CHECK-NEXT: ret
   %quot = udiv i32 %a, %b
   ret i32 %quot
 }
@@ -53,7 +55,9 @@
 define i32 @sdiv32(i32 %a, i32 %b) {
 ; CHECK-LABEL: sdiv32:
 ; CHECK: call __divmodsi4
-; CHECK: ret
+; CHECK-NEXT: movw r22, r18
+; CHECK-NEXT: movw r24, r20
+; CHECK-NEXT: ret
   %quot = sdiv i32 %a, %b
   ret i32 %quot
 }
Index: llvm/test/CodeGen/AVR/rem.ll
===================================================================
--- llvm/test/CodeGen/AVR/rem.ll
+++ llvm/test/CodeGen/AVR/rem.ll
@@ -42,7 +42,7 @@
 define i32 @urem32(i32 %a, i32 %b) {
 ; CHECK-LABEL: urem32:
 ; CHECK: call __udivmodsi4
-; CHECK: ret
+; CHECK-NEXT: ret
   %rem = urem i32 %a, %b
   ret i32 %rem
 }
@@ -51,7 +51,7 @@
 define i32 @srem32(i32 %a, i32 %b) {
 ; CHECK-LABEL: srem32:
 ; CHECK: call __divmodsi4
-; CHECK: ret
+; CHECK-NEXT: ret
   %rem = srem i32 %a, %b
   ret i32 %rem
 }
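The udiv/sdiv versus urem/srem asymmetry above has a simple explanation: the libcall produces both results at once, and the remainder already sits where an i32 return value lives, so only the div tests need the extra movw copies. Below is a functional model with udivmodsi4 as a hypothetical stand-in for the real libcall; the register assignments in the comments follow the avr-libgcc convention assumed here, which matches what the movw instructions in the checks suggest.

#include <cassert>
#include <cstdint>

// Functional model of __udivmodsi4 (illustrative, not the real libcall):
// one call yields both quotient and remainder.
struct UDivMod32 {
  uint32_t Quot; // assumed to come back in r18:r21, hence the two MOVWs
  uint32_t Rem;  // assumed to come back in r22:r25, the i32 return registers
};

static UDivMod32 udivmodsi4(uint32_t A, uint32_t B) {
  return {A / B, A % B};
}

int main() {
  UDivMod32 R = udivmodsi4(100, 7);
  assert(R.Quot == 14 && R.Rem == 2);
  return 0;
}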
Index: llvm/test/CodeGen/AVR/return.ll
===================================================================
--- llvm/test/CodeGen/AVR/return.ll
+++ llvm/test/CodeGen/AVR/return.ll
@@ -390,76 +390,10 @@
 ; AVR-NEXT: push r29
 ; AVR-NEXT: in r28, 61
 ; AVR-NEXT: in r29, 62
-; AVR-NEXT: ldd r18, Y+7
-; AVR-NEXT: ldd r19, Y+8
+; AVR-NEXT: ldd r24, Y+7
+; AVR-NEXT: ldd r25, Y+8
 ; AVR-NEXT: ldd r22, Y+5
 ; AVR-NEXT: ldd r23, Y+6
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: mov r25, r23
-; AVR-NEXT: ror r25
-; AVR-NEXT: mov r24, r22
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
-; AVR-NEXT: lsr r19
-; AVR-NEXT: ror r18
-; AVR-NEXT: ror r25
-; AVR-NEXT: ror r24
 ; AVR-NEXT: pop r29
 ; AVR-NEXT: pop r28
 ; AVR-NEXT: ret
@@ -473,8 +407,8 @@
 ; TINY-NEXT: in r16, 63
 ; TINY-NEXT: subi r28, 241
 ; TINY-NEXT: sbci r29, 255
-; TINY-NEXT: ld r20, Y+
-; TINY-NEXT: ld r21, Y+
+; TINY-NEXT: ld r24, Y+
+; TINY-NEXT: ld r25, Y+
 ; TINY-NEXT: subi r28, 2
 ; TINY-NEXT: sbci r29, 0
 ; TINY-NEXT: subi r28, 15
@@ -490,72 +424,6 @@
 ; TINY-NEXT: subi r28, 13
 ; TINY-NEXT: sbci r29, 0
 ; TINY-NEXT: out 63, r16
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: mov r25, r23
-; TINY-NEXT: ror r25
-; TINY-NEXT: mov r24, r22
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
-; TINY-NEXT: lsr r21
-; TINY-NEXT: ror r20
-; TINY-NEXT: ror r25
-; TINY-NEXT: ror r24
 ; TINY-NEXT: pop r29
 ; TINY-NEXT: pop r28
 ; TINY-NEXT: ret
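The ArithmeticShift case in the C++ further up builds the sign byte with ADD plus SBC; on AVR, ADD Rd,Rd is the same operation as LSL Rd, which is why the ashr tests in shift32.ll below check for lsl followed by sbc. Here is a small model of the trick, with signExtendByte as an illustrative name that is not part of the patch:

#include <cassert>
#include <cstdint>

// Model of the ADD+SBC sign-extension trick (illustrative, not LLVM code).
// ADD Rd,Rd shifts the sign bit into the carry flag; SBC Rd,Rd then computes
// 0 - 0 - C, producing 0x00 or 0xFF.
static uint8_t signExtendByte(uint8_t MSB) {
  unsigned Carry = MSB >> 7;    // ADD Rd,Rd: carry ends up as the sign bit
  return (uint8_t)(0u - Carry); // SBC Rd,Rd: all zeroes or all ones
}

int main() {
  assert(signExtendByte(0x80) == 0xFF); // negative input: fill with ones
  assert(signExtendByte(0x7F) == 0x00); // non-negative input: fill with zeroes
  return 0;
}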
Index: llvm/test/CodeGen/AVR/shift32.ll
===================================================================
--- llvm/test/CodeGen/AVR/shift32.ll
+++ llvm/test/CodeGen/AVR/shift32.ll
@@ -33,6 +33,57 @@
   ret i32 %res
 }
 
+define i32 @shl_i32_8(i32 %a) {
+; CHECK-LABEL: shl_i32_8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r23, r22
+; CHECK-NEXT: mov r22, r1
+; CHECK-NEXT: ret
+  %res = shl i32 %a, 8
+  ret i32 %res
+}
+
+define i32 @shl_i32_9(i32 %a) {
+; CHECK-LABEL: shl_i32_9:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r22
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: rol r24
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r23, r22
+; CHECK-NEXT: mov r22, r1
+; CHECK-NEXT: ret
+  %res = shl i32 %a, 9
+  ret i32 %res
+}
+
+; Combined with the register allocator, shift instructions can sometimes be
+; optimized away entirely. The least significant registers are simply stored
+; directly instead of moving them first.
+; TODO: the `mov Rd, r1` instructions are needed because most of the
+; instructions work on 16-bit register pairs and are only split after
+; register allocation. These two instructions could be avoided if the 16-bit
+; store instruction was split into two 8-bit store instructions before
+; register allocation. That would make this shift a no-op.
+define void @shl_i32_16_ptr(i32 %a, ptr %ptr) {
+; CHECK-LABEL: shl_i32_16_ptr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov r25, r1
+; CHECK-NEXT: mov r24, r1
+; CHECK-NEXT: movw r30, r20
+; CHECK-NEXT: std Z+2, r22
+; CHECK-NEXT: std Z+3, r23
+; CHECK-NEXT: st Z, r24
+; CHECK-NEXT: std Z+1, r25
+; CHECK-NEXT: ret
+  %res = shl i32 %a, 16
+  store i32 %res, ptr %ptr
+  ret void
+}
+
 define i32 @lshr_i32_1(i32 %a) {
 ; CHECK-LABEL: lshr_i32_1:
 ; CHECK: ; %bb.0:
@@ -61,6 +112,33 @@
   ret i32 %res
 }
 
+define i32 @lshr_i32_8(i32 %a) {
+; CHECK-LABEL: lshr_i32_8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov r19, r1
+; CHECK-NEXT: mov r18, r25
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+  %res = lshr i32 %a, 8
+  ret i32 %res
+}
+
+define i32 @lshr_i32_24(i32 %a) {
+; CHECK-LABEL: lshr_i32_24:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov r19, r1
+; CHECK-NEXT: mov r18, r1
+; CHECK-NEXT: mov r23, r1
+; CHECK-NEXT: mov r22, r25
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+  %res = lshr i32 %a, 24
+  ret i32 %res
+}
+
 define i32 @ashr_i32_1(i32 %a) {
 ; CHECK-LABEL: ashr_i32_1:
 ; CHECK: ; %bb.0:
@@ -88,3 +166,32 @@
   %res = ashr i32 %a, 2
   ret i32 %res
 }
+
+; TODO: this could be optimized to 4 movs instead of 6.
+define i32 @ashr_i32_8(i32 %a) {
+; CHECK-LABEL: ashr_i32_8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov r19, r25
+; CHECK-NEXT: lsl r19
+; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: mov r18, r25
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+  %res = ashr i32 %a, 8
+  ret i32 %res
+}
+
+define i32 @ashr_i32_16(i32 %a) {
+; CHECK-LABEL: ashr_i32_16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: lsl r25
+; CHECK-NEXT: sbc r25, r25
+; CHECK-NEXT: mov r24, r25
+; CHECK-NEXT: ret
+  %res = ashr i32 %a, 16
+  ret i32 %res
+}
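To make the TODO before shl_i32_16_ptr concrete: once the 16-bit store is split into byte stores, storing a value shifted left by 16 requires no data movement at all. Below is a host-side sketch, with storeShl16 as a hypothetical name and a little-endian host assumed for the final check:

#include <cassert>
#include <cstdint>
#include <cstring>

// Byte-wise model of `store i32 (a << 16)`: the low half of `a` lands at
// offsets 2..3 and constant zeroes at offsets 0..1, so no register moves
// would be needed (illustrative only, not LLVM code).
static void storeShl16(uint32_t A, uint8_t *Ptr) {
  Ptr[0] = 0;                 // st Z, zero
  Ptr[1] = 0;                 // std Z+1, zero
  Ptr[2] = (uint8_t)A;        // std Z+2, low byte of a
  Ptr[3] = (uint8_t)(A >> 8); // std Z+3, second byte of a
}

int main() {
  uint8_t Buf[4];
  storeShl16(0x00005678, Buf);
  uint32_t Out;
  std::memcpy(&Out, Buf, sizeof(Out));
  assert(Out == 0x56780000u); // equals 0x5678 << 16 on a little-endian host
  return 0;
}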