Index: llvm/lib/Target/AVR/AVRISelLowering.cpp
===================================================================
--- llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -1923,6 +1923,54 @@
   // The bigger shifts are already handled above.
   assert((ShiftAmt < 8) && "Unexpect shift amount");
 
+  // Shift by four bits, using a complicated swap/eor/andi/eor sequence.
+  // It only works for logical shifts because the bits shifted in are all
+  // zeroes.
+  // Example: shifting a 16-bit value (2 bytes, in r1:r0) left by four bits:
+  //
+  //   ; shift r1, the most significant byte
+  //   swap r1
+  //   andi r1, 0xf0
+  //   ; shift r0, moving its high nibble into r1
+  //   swap r0
+  //   eor r1, r0
+  //   andi r0, 0xf0
+  //   eor r1, r0
+  if (!ArithmeticShift && ShiftAmt >= 4) {
+    Register Prev = 0;
+    for (size_t i = 0; i < Regs.size(); i++) {
+      size_t Idx = ShiftLeft ? i : Regs.size() - i - 1;
+      Register SwapReg = MRI.createVirtualRegister(&AVR::LD8RegClass);
+      BuildMI(*BB, MI, dl, TII.get(AVR::SWAPRd), SwapReg)
+          .addReg(Regs[Idx].first, 0, Regs[Idx].second);
+      if (Prev != 0) {
+        Register R = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+        BuildMI(*BB, MI, dl, TII.get(AVR::EORRdRr), R)
+            .addReg(Prev)
+            .addReg(SwapReg);
+        Prev = R;
+      }
+      Register AndReg = MRI.createVirtualRegister(&AVR::LD8RegClass);
+      BuildMI(*BB, MI, dl, TII.get(AVR::ANDIRdK), AndReg)
+          .addReg(SwapReg)
+          .addImm(ShiftLeft ? 0xf0 : 0x0f);
+      if (Prev != 0) {
+        Register R = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+        BuildMI(*BB, MI, dl, TII.get(AVR::EORRdRr), R)
+            .addReg(Prev)
+            .addReg(AndReg);
+        if (ShiftLeft) {
+          Regs[Idx - 1] = std::pair(R, 0);
+        } else {
+          Regs[Idx + 1] = std::pair(R, 0);
+        }
+      }
+      Prev = AndReg;
+      Regs[Idx] = std::pair(AndReg, 0);
+    }
+    ShiftAmt -= 4;
+  }
+
   // Shift by one. This is the fallback that always works, and the shift
   // operation that is used for 1, 2, and 3 bit shifts.
   while (ShiftLeft && ShiftAmt) {
Index: llvm/test/CodeGen/AVR/shift32.ll
===================================================================
--- llvm/test/CodeGen/AVR/shift32.ll
+++ llvm/test/CodeGen/AVR/shift32.ll
@@ -29,6 +29,55 @@
   ret i32 %res
 }
 
+define i32 @shl_i32_4(i32 %a) {
+; CHECK-LABEL: shl_i32_4:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    swap r25
+; CHECK-NEXT:    andi r25, 240
+; CHECK-NEXT:    swap r24
+; CHECK-NEXT:    eor r25, r24
+; CHECK-NEXT:    andi r24, 240
+; CHECK-NEXT:    eor r25, r24
+; CHECK-NEXT:    swap r23
+; CHECK-NEXT:    eor r24, r23
+; CHECK-NEXT:    andi r23, 240
+; CHECK-NEXT:    eor r24, r23
+; CHECK-NEXT:    swap r22
+; CHECK-NEXT:    eor r23, r22
+; CHECK-NEXT:    andi r22, 240
+; CHECK-NEXT:    eor r23, r22
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 4
+  ret i32 %res
+}
+
+; shift by four bits and then by one bit
+define i32 @shl_i32_5(i32 %a) {
+; CHECK-LABEL: shl_i32_5:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    swap r25
+; CHECK-NEXT:    andi r25, 240
+; CHECK-NEXT:    swap r24
+; CHECK-NEXT:    eor r25, r24
+; CHECK-NEXT:    andi r24, 240
+; CHECK-NEXT:    eor r25, r24
+; CHECK-NEXT:    swap r23
+; CHECK-NEXT:    eor r24, r23
+; CHECK-NEXT:    andi r23, 240
+; CHECK-NEXT:    eor r24, r23
+; CHECK-NEXT:    swap r22
+; CHECK-NEXT:    eor r23, r22
+; CHECK-NEXT:    andi r22, 240
+; CHECK-NEXT:    eor r23, r22
+; CHECK-NEXT:    lsl r22
+; CHECK-NEXT:    rol r23
+; CHECK-NEXT:    rol r24
+; CHECK-NEXT:    rol r25
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 5
+  ret i32 %res
+}
+
 define i32 @shl_i32_8(i32 %a) {
 ; CHECK-LABEL: shl_i32_8:
 ; CHECK:       ; %bb.0:
@@ -56,6 +105,29 @@
   ret i32 %res
 }
 
+; shift 3 of 4 registers and move the others around
+define i32 @shl_i32_12(i32 %a) {
+; CHECK-LABEL: shl_i32_12:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    swap r24
+; CHECK-NEXT:    andi r24, 240
+; CHECK-NEXT:    swap r23
+; CHECK-NEXT:    eor r24, r23
+; CHECK-NEXT:    andi r23, 240
+; CHECK-NEXT:    eor r24, r23
+; CHECK-NEXT:    swap r22
+; CHECK-NEXT:    eor r23, r22
+; CHECK-NEXT:    andi r22, 240
+; CHECK-NEXT:    eor r23, r22
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    mov r23, r22
+; CHECK-NEXT:    mov r22, r1
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 12
+  ret i32 %res
+}
+
 ; Combined with the register allocator, shift instructions can sometimes be
 ; optimized away entirely. The least significant registers are simply stored
 ; directly instead of moving them first.
@@ -75,6 +147,21 @@
   ret void
 }
 
+; shift only the most significant byte and then move it
+define i32 @shl_i32_28(i32 %a) {
+; CHECK-LABEL: shl_i32_28:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    swap r22
+; CHECK-NEXT:    andi r22, 240
+; CHECK-NEXT:    mov r25, r22
+; CHECK-NEXT:    mov r24, r1
+; CHECK-NEXT:    mov r23, r1
+; CHECK-NEXT:    mov r22, r1
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 28
+  ret i32 %res
+}
+
 define i32 @lshr_i32_1(i32 %a) {
 ; CHECK-LABEL: lshr_i32_1:
 ; CHECK:       ; %bb.0:
@@ -103,6 +190,28 @@
   ret i32 %res
 }
 
+define i32 @lshr_i32_4(i32 %a) {
+; CHECK-LABEL: lshr_i32_4:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    swap r22
+; CHECK-NEXT:    andi r22, 15
+; CHECK-NEXT:    swap r23
+; CHECK-NEXT:    eor r22, r23
+; CHECK-NEXT:    andi r23, 15
+; CHECK-NEXT:    eor r22, r23
+; CHECK-NEXT:    swap r24
+; CHECK-NEXT:    eor r23, r24
+; CHECK-NEXT:    andi r24, 15
+; CHECK-NEXT:    eor r23, r24
+; CHECK-NEXT:    swap r25
+; CHECK-NEXT:    eor r24, r25
+; CHECK-NEXT:    andi r25, 15
+; CHECK-NEXT:    eor r24, r25
+; CHECK-NEXT:    ret
+  %res = lshr i32 %a, 4
+  ret i32 %res
+}
+
 define i32 @lshr_i32_8(i32 %a) {
 ; CHECK-LABEL: lshr_i32_8:
 ; CHECK:       ; %bb.0:
@@ -175,6 +284,31 @@
   ret i32 %res
 }
 
+; can't use the swap/andi/eor trick here: an arithmetic shift shifts in sign bits, not zeroes
+define i32 @ashr_i32_4(i32 %a) {
+; CHECK-LABEL: ashr_i32_4:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    asr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    asr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    asr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    asr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    ret
+  %res = ashr i32 %a, 4
+  ret i32 %res
+}
+
 ; TODO: this could be optimized to 4 movs, instead of 6.
 define i32 @ashr_i32_8(i32 %a) {
 ; CHECK-LABEL: ashr_i32_8:
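
As a sanity check on the swap/eor/andi/eor comment in the AVRISelLowering.cpp
hunk above, here is a minimal standalone C++ sketch. It is not part of the
patch, and swapNibbles is a made-up helper name standing in for the AVR `swap`
instruction. It emulates the commented two-byte sequence for every 16-bit
input and verifies that the result matches a plain logical shift left by four,
which is the property the lowering relies on:

    #include <cassert>
    #include <cstdint>

    // Emulate the AVR `swap` instruction: exchange the nibbles of a byte.
    static uint8_t swapNibbles(uint8_t b) {
      return (uint8_t)((b << 4) | (b >> 4));
    }

    int main() {
      for (uint32_t v = 0; v <= 0xffff; v++) {
        uint8_t r0 = (uint8_t)v;        // low byte
        uint8_t r1 = (uint8_t)(v >> 8); // high byte
        r1 = swapNibbles(r1);           // swap r1
        r1 &= 0xf0;                     // andi r1, 0xf0
        r0 = swapNibbles(r0);           // swap r0
        r1 ^= r0;                       // eor r1, r0
        r0 &= 0xf0;                     // andi r0, 0xf0
        r1 ^= r0;                       // eor r1, r0
        // The eor/andi/eor dance XORs the swapped r0 into r1, clears r0's
        // low nibble, then XORs again; the net effect is that r0's old high
        // nibble ends up in r1's low nibble, i.e. a 16-bit shl by 4.
        assert((uint16_t)((r1 << 8) | r0) == (uint16_t)(v << 4));
      }
      return 0;
    }

The same reasoning applies to the lshr_i32_4 test above: for a logical right
shift the andi masks become 0x0f (15 in the generated asm) and the bytes are
processed from least to most significant.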