diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -2157,16 +2157,42 @@
   insertMultibyteShift(MI, BB, Registers, Opc, ShiftAmt);
 
   // Combine the 8-bit registers into 16-bit register pairs.
-  BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
-      .addReg(Registers[0].first, 0, Registers[0].second)
-      .addImm(AVR::sub_hi)
-      .addReg(Registers[1].first, 0, Registers[1].second)
-      .addImm(AVR::sub_lo);
-  BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
-      .addReg(Registers[2].first, 0, Registers[2].second)
-      .addImm(AVR::sub_hi)
-      .addReg(Registers[3].first, 0, Registers[3].second)
-      .addImm(AVR::sub_lo);
+  // This is done either from LSB to MSB or from MSB to LSB, depending on the
+  // shift. It's an optimization so that the register allocator will use the
+  // fewest movs possible (which order we use isn't a correctness issue, just
+  // an optimization issue).
+  //   - lsl prefers starting from the most significant byte (2nd case).
+  //   - lshr prefers starting from the least significant byte (1st case).
+  //   - for ashr it depends on the number of shifted bytes.
+  // Some shift operations still don't get optimal mov sequences even with
+  // this distinction. TODO: figure out why and try to fix it (but we're
+  // already equal to or faster than avr-gcc in all cases except ashr 8).
+  if (Opc != ISD::SHL &&
+      (Opc != ISD::SRA || (ShiftAmt < 16 || ShiftAmt >= 22))) {
+    // Use the resulting registers starting with the least significant byte.
+    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
+        .addReg(Registers[3].first, 0, Registers[3].second)
+        .addImm(AVR::sub_lo)
+        .addReg(Registers[2].first, 0, Registers[2].second)
+        .addImm(AVR::sub_hi);
+    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
+        .addReg(Registers[1].first, 0, Registers[1].second)
+        .addImm(AVR::sub_lo)
+        .addReg(Registers[0].first, 0, Registers[0].second)
+        .addImm(AVR::sub_hi);
+  } else {
+    // Use the resulting registers starting with the most significant byte.
+    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
+        .addReg(Registers[0].first, 0, Registers[0].second)
+        .addImm(AVR::sub_hi)
+        .addReg(Registers[1].first, 0, Registers[1].second)
+        .addImm(AVR::sub_lo);
+    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
+        .addReg(Registers[2].first, 0, Registers[2].second)
+        .addImm(AVR::sub_hi)
+        .addReg(Registers[3].first, 0, Registers[3].second)
+        .addImm(AVR::sub_lo);
+  }
 
   // Remove the pseudo instruction.
   MI.eraseFromParent();
diff --git a/llvm/test/CodeGen/AVR/shift32.ll b/llvm/test/CodeGen/AVR/shift32.ll
--- a/llvm/test/CodeGen/AVR/shift32.ll
+++ b/llvm/test/CodeGen/AVR/shift32.ll
@@ -314,10 +314,9 @@
 ; CHECK-NEXT: rol r24
 ; CHECK-NEXT: rol r25
 ; CHECK-NEXT: rol r19
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
 ; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
 ; CHECK-NEXT: movw r24, r18
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 6
@@ -333,10 +332,9 @@
 ; CHECK-NEXT: rol r25
 ; CHECK-NEXT: mov r19, r1
 ; CHECK-NEXT: rol r19
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
 ; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
 ; CHECK-NEXT: movw r24, r18
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 7
@@ -346,12 +344,10 @@
 define i32 @lshr_i32_8(i32 %a) {
 ; CHECK-LABEL: lshr_i32_8:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: mov r19, r1
-; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
-; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
+; CHECK-NEXT: mov r24, r25
+; CHECK-NEXT: mov r25, r1
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 8
   ret i32 %res
@@ -363,12 +359,10 @@
 ; CHECK-NEXT: lsr r25
 ; CHECK-NEXT: ror r24
 ; CHECK-NEXT: ror r23
-; CHECK-NEXT: mov r19, r1
-; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
-; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
+; CHECK-NEXT: mov r24, r25
+; CHECK-NEXT: mov r25, r1
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 9
   ret i32 %res
@@ -388,11 +382,10 @@
 define i32 @lshr_i32_24(i32 %a) {
 ; CHECK-LABEL: lshr_i32_24:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: mov r19, r1
-; CHECK-NEXT: mov r18, r1
-; CHECK-NEXT: mov r23, r1
 ; CHECK-NEXT: mov r22, r25
-; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: mov r23, r1
+; CHECK-NEXT: mov r24, r1
+; CHECK-NEXT: mov r25, r1
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 24
   ret i32 %res
@@ -404,9 +397,9 @@
 ; CHECK-NEXT: lsl r25
 ; CHECK-NEXT: mov r22, r1
 ; CHECK-NEXT: rol r22
-; CHECK-NEXT: mov r25, r1
-; CHECK-NEXT: mov r24, r1
 ; CHECK-NEXT: mov r23, r1
+; CHECK-NEXT: mov r24, r1
+; CHECK-NEXT: mov r25, r1
 ; CHECK-NEXT: ret
   %res = lshr i32 %a, 31
   ret i32 %res
@@ -473,27 +466,25 @@
 ; CHECK-NEXT: rol r24
 ; CHECK-NEXT: rol r25
 ; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
 ; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
 ; CHECK-NEXT: movw r24, r18
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 7
   ret i32 %res
 }
 
-; TODO: this could be optimized to 4 movs, instead of 6.
+; TODO: this could be optimized to 4 movs, instead of 5.
 define i32 @ashr_i32_8(i32 %a) {
 ; CHECK-LABEL: ashr_i32_8:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: mov r19, r25
 ; CHECK-NEXT: lsl r19
 ; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: mov r23, r24
 ; CHECK-NEXT: mov r18, r25
-; CHECK-NEXT: mov r25, r24
-; CHECK-NEXT: mov r24, r23
-; CHECK-NEXT: movw r22, r24
 ; CHECK-NEXT: movw r24, r18
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 8
@@ -531,13 +522,13 @@
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: lsl r24
 ; CHECK-NEXT: rol r25
-; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: sbc r18, r18
 ; CHECK-NEXT: lsl r24
 ; CHECK-NEXT: rol r25
-; CHECK-NEXT: mov r23, r19
+; CHECK-NEXT: mov r23, r18
 ; CHECK-NEXT: rol r23
-; CHECK-NEXT: mov r18, r19
 ; CHECK-NEXT: mov r22, r25
+; CHECK-NEXT: mov r19, r18
 ; CHECK-NEXT: movw r24, r18
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 22
   ret i32 %res
@@ -549,11 +540,10 @@
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: lsl r24
 ; CHECK-NEXT: rol r25
-; CHECK-NEXT: sbc r19, r19
-; CHECK-NEXT: mov r18, r19
-; CHECK-NEXT: mov r23, r19
+; CHECK-NEXT: sbc r23, r23
 ; CHECK-NEXT: mov r22, r25
-; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r25, r23
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 23
   ret i32 %res
@@ -563,13 +553,12 @@
 ; CHECK-LABEL: ashr_i32_30:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: lsl r25
-; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: sbc r23, r23
 ; CHECK-NEXT: lsl r25
-; CHECK-NEXT: mov r22, r19
+; CHECK-NEXT: mov r22, r23
 ; CHECK-NEXT: rol r22
-; CHECK-NEXT: mov r18, r19
-; CHECK-NEXT: mov r23, r19
-; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r25, r23
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 30
   ret i32 %res
@@ -579,8 +568,8 @@
 ; CHECK-LABEL: ashr_i32_31:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: lsl r25
-; CHECK-NEXT: sbc r23, r23
-; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: sbc r22, r22
+; CHECK-NEXT: mov r23, r22
 ; CHECK-NEXT: movw r24, r22
 ; CHECK-NEXT: ret
   %res = ashr i32 %a, 31
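
For reference, the LSB-first vs. MSB-first decision in the AVRISelLowering.cpp hunk reduces to a small predicate on the shift opcode and amount. The standalone sketch below restates that condition outside the LLVM build; ShiftKind and combineLsbFirst are hypothetical names invented for illustration, not the in-tree API.

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-ins for ISD::SHL / ISD::SRL / ISD::SRA.
    enum class ShiftKind { Shl, Srl, Sra };

    // Mirrors the patch's condition:
    //   Opc != ISD::SHL && (Opc != ISD::SRA || ShiftAmt < 16 || ShiftAmt >= 22)
    // Returns true when the two REG_SEQUENCE instructions should be built
    // starting from the least significant byte, false for MSB-first.
    static bool combineLsbFirst(ShiftKind Kind, int64_t ShiftAmt) {
      switch (Kind) {
      case ShiftKind::Shl:
        return false; // lsl prefers starting from the most significant byte.
      case ShiftKind::Srl:
        return true; // lshr prefers starting from the least significant byte.
      case ShiftKind::Sra:
        return ShiftAmt < 16 || ShiftAmt >= 22; // depends on the shifted bytes
      }
      return true; // unreachable with a valid ShiftKind
    }

    int main() {
      assert(!combineLsbFirst(ShiftKind::Shl, 8));  // shl: always MSB-first
      assert(combineLsbFirst(ShiftKind::Srl, 8));   // lshr: always LSB-first
      assert(combineLsbFirst(ShiftKind::Sra, 8));   // ashr 8: LSB-first
      assert(!combineLsbFirst(ShiftKind::Sra, 16)); // ashr 16..21: MSB-first
      assert(combineLsbFirst(ShiftKind::Sra, 22));  // ashr >= 22: LSB-first
      return 0;
    }

Either ordering is functionally correct: the operand order of REG_SEQUENCE only biases which subregister copies the register allocator can coalesce away, which is why the test updates above change mov counts but never results.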