Index: llvm/lib/Target/AVR/AVRISelLowering.cpp
===================================================================
--- llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -2142,16 +2142,41 @@
   insertMultibyteShift(MI, BB, Registers, ShiftAmt, ArithmeticShift);
 
   // Combine the 8-bit registers into 16-bit register pairs.
-  BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
-      .addReg(Registers[0].first, 0, Registers[0].second)
-      .addImm(AVR::sub_hi)
-      .addReg(Registers[1].first, 0, Registers[1].second)
-      .addImm(AVR::sub_lo);
-  BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
-      .addReg(Registers[2].first, 0, Registers[2].second)
-      .addImm(AVR::sub_hi)
-      .addReg(Registers[3].first, 0, Registers[3].second)
-      .addImm(AVR::sub_lo);
+  // This is done either from LSB to MSB or from MSB to LSB, depending on the
+  // shift. It's an optimization so that the register allocator will use the
+  // fewest movs possible (which order we use isn't a correctness issue, just an
+  // optimization issue).
+  //  - lsl prefers starting from the most significant byte (2nd case).
+  //  - lshr prefers starting from the least significant byte (1st case).
+  //  - for ashr it depends on the number of shifted bytes.
+  // Some shift operations still don't get the most optimal mov sequences even
+  // with this distinction. TODO: figure out why and try to fix it (but we're
+  // already equal to or faster than avr-gcc in all cases except ashr 8).
+  if (ShiftAmt > 0 && (!ArithmeticShift || (ShiftAmt < 16 || ShiftAmt >= 22))) {
+    // Use the resulting registers starting with the least significant byte.
+    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
+        .addReg(Registers[3].first, 0, Registers[3].second)
+        .addImm(AVR::sub_lo)
+        .addReg(Registers[2].first, 0, Registers[2].second)
+        .addImm(AVR::sub_hi);
+    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
+        .addReg(Registers[1].first, 0, Registers[1].second)
+        .addImm(AVR::sub_lo)
+        .addReg(Registers[0].first, 0, Registers[0].second)
+        .addImm(AVR::sub_hi);
+  } else {
+    // Use the resulting registers starting with the most significant byte.
+    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
+        .addReg(Registers[0].first, 0, Registers[0].second)
+        .addImm(AVR::sub_hi)
+        .addReg(Registers[1].first, 0, Registers[1].second)
+        .addImm(AVR::sub_lo);
+    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
+        .addReg(Registers[2].first, 0, Registers[2].second)
+        .addImm(AVR::sub_hi)
+        .addReg(Registers[3].first, 0, Registers[3].second)
+        .addImm(AVR::sub_lo);
+  }
 
   // Remove the pseudo instruction.
   MI.eraseFromParent();
Index: llvm/test/CodeGen/AVR/shift32.ll
===================================================================
--- llvm/test/CodeGen/AVR/shift32.ll
+++ llvm/test/CodeGen/AVR/shift32.ll
@@ -310,10 +310,9 @@
 ; CHECK-NEXT: rol r24
 ; CHECK-NEXT: rol r25
 ; CHECK-NEXT: rol r19
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
 ; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
 ; CHECK-NEXT: movw r24, r18
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 6
@@ -329,10 +328,9 @@
 ; CHECK-NEXT: rol r25
 ; CHECK-NEXT: mov r19, r1
 ; CHECK-NEXT: rol r19
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
 ; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
 ; CHECK-NEXT: movw r24, r18
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 7
@@ -342,12 +340,10 @@
 define i32 @lshr_i32_8(i32 %a) {
 ; CHECK-LABEL: lshr_i32_8:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: mov r19, r1
-; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
-; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
+; CHECK-NEXT: mov r24, r25
+; CHECK-NEXT: mov r25, r1
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 8
   ret i32 %res
@@ -356,11 +352,10 @@
 define i32 @lshr_i32_24(i32 %a) {
 ; CHECK-LABEL: lshr_i32_24:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: mov r19, r1
-; CHECK-NEXT: mov r18, r1
-; CHECK-NEXT: mov r23, r1
 ; CHECK-NEXT: mov r22, r25
-; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: mov r23, r1
+; CHECK-NEXT: mov r24, r1
+; CHECK-NEXT: mov r25, r1
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 24
   ret i32 %res
@@ -372,9 +367,9 @@
 ; CHECK-NEXT: lsl r25
 ; CHECK-NEXT: mov r22, r1
 ; CHECK-NEXT: rol r22
-; CHECK-NEXT: mov r25, r1
-; CHECK-NEXT: mov r24, r1
 ; CHECK-NEXT: mov r23, r1
+; CHECK-NEXT: mov r24, r1
+; CHECK-NEXT: mov r25, r1
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 31
   ret i32 %res
@@ -441,27 +436,25 @@
 ; CHECK-NEXT: rol r24
 ; CHECK-NEXT: rol r25
 ; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
 ; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
 ; CHECK-NEXT: movw r24, r18
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 7
   ret i32 %res
 }
 
-; TODO: this could be optimized to 4 movs, instead of 6.
+; TODO: this could be optimized to 4 movs, instead of 5.
 define i32 @ashr_i32_8(i32 %a) {
 ; CHECK-LABEL: ashr_i32_8:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: mov r19, r25
 ; CHECK-NEXT: lsl r19
 ; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
 ; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
 ; CHECK-NEXT: movw r24, r18
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 8
@@ -485,13 +478,13 @@
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: lsl r24
 ; CHECK-NEXT: rol r25
-; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: sbc r18, r18
 ; CHECK-NEXT: lsl r24
 ; CHECK-NEXT: rol r25
-; CHECK-NEXT: mov r23, r19
+; CHECK-NEXT: mov r23, r18
 ; CHECK-NEXT: rol r23
-; CHECK-NEXT: mov r18, r19
 ; CHECK-NEXT: mov r22, r25
+; CHECK-NEXT: mov r19, r18
 ; CHECK-NEXT: movw r24, r18
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 22
@@ -503,11 +496,10 @@
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: lsl r24
 ; CHECK-NEXT: rol r25
-; CHECK-NEXT: sbc r19, r19
-; CHECK-NEXT: mov r18, r19
-; CHECK-NEXT: mov r23, r19
+; CHECK-NEXT: sbc r23, r23
 ; CHECK-NEXT: mov r22, r25
-; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r25, r23
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 23
   ret i32 %res
@@ -517,13 +509,12 @@
 ; CHECK-LABEL: ashr_i32_30:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: lsl r25
-; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: sbc r23, r23
 ; CHECK-NEXT: lsl r25
-; CHECK-NEXT: mov r22, r19
+; CHECK-NEXT: mov r22, r23
 ; CHECK-NEXT: rol r22
-; CHECK-NEXT: mov r18, r19
-; CHECK-NEXT: mov r23, r19
-; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r25, r23
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 30
   ret i32 %res
@@ -533,8 +524,8 @@
 ; CHECK-LABEL: ashr_i32_31:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: lsl r25
-; CHECK-NEXT: sbc r23, r23
-; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: sbc r22, r22
+; CHECK-NEXT: mov r23, r22
 ; CHECK-NEXT: movw r24, r22
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 31
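
For context (not part of the patch): the lshr_i32_8 hunk above is a good illustration of why the REG_SEQUENCE operand order matters. The standalone C++ sketch below replays the old instruction sequence (MSB-first, which forces the r18:r19 scratch pair and two movw) and the new one (LSB-first, four plain movs) against a simulated register file and checks that both leave the same i32 result. It assumes the usual AVR conventions that a 32-bit value is returned in r22 (LSB) through r25 (MSB), that r1 holds zero, and that movw copies an even/odd register pair; the register-file model and the sample input value 0x44332211 are invented for illustration only.

// Illustration only: compares the two mov sequences for lshr_i32_8 from
// shift32.ll on a simulated AVR register file. Build with any C++11 compiler.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

using RegFile = std::map<std::string, std::uint8_t>;

static RegFile makeInput() {
  // i32 argument/result in r22 (LSB) .. r25 (MSB); r1 is the zero register.
  return {{"r22", 0x11}, {"r23", 0x22}, {"r24", 0x33}, {"r25", 0x44}, {"r1", 0}};
}

static std::uint32_t result(const RegFile &R) {
  return std::uint32_t(R.at("r22")) | std::uint32_t(R.at("r23")) << 8 |
         std::uint32_t(R.at("r24")) << 16 | std::uint32_t(R.at("r25")) << 24;
}

int main() {
  // Old sequence: result assembled MSB-first via the r18:r19 scratch pair.
  RegFile Old = makeInput();
  Old["r19"] = Old["r1"];                            // mov r19, r1
  Old["r18"] = Old["r25"];                           // mov r18, r25
  Old["r25"] = Old["r24"];                           // mov r25, r24
  Old["r24"] = Old["r23"];                           // mov r24, r23
  Old["r22"] = Old["r24"]; Old["r23"] = Old["r25"];  // movw r22, r24
  Old["r24"] = Old["r18"]; Old["r25"] = Old["r19"];  // movw r24, r18
  // New sequence: result assembled LSB-first, no scratch registers needed.
  RegFile New = makeInput();
  New["r22"] = New["r23"];                           // mov r22, r23
  New["r23"] = New["r24"];                           // mov r23, r24
  New["r24"] = New["r25"];                           // mov r24, r25
  New["r25"] = New["r1"];                            // mov r25, r1
  // Both sequences compute 0x44332211 >> 8; the new one is two instructions
  // shorter because the bytes are written in the order they are consumed.
  assert(result(Old) == result(New));
  assert(result(New) == (0x44332211u >> 8));
  std::printf("both sequences compute 0x%08x\n", result(New));
  return 0;
}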