Index: llvm/lib/Target/AVR/AVRISelLowering.h
===================================================================
--- llvm/lib/Target/AVR/AVRISelLowering.h
+++ llvm/lib/Target/AVR/AVRISelLowering.h
@@ -39,14 +39,17 @@
   LSLBN, ///< Byte logical shift left N bits.
   LSLWN, ///< Word logical shift left N bits.
   LSLHI, ///< Higher 8-bit of word logical shift left.
+  LSLW,  ///< Wide logical shift left.
   LSR, ///< Logical shift right.
   LSRBN, ///< Byte logical shift right N bits.
   LSRWN, ///< Word logical shift right N bits.
   LSRLO, ///< Lower 8-bit of word logical shift right.
+  LSRW,  ///< Wide logical shift right.
   ASR, ///< Arithmetic shift right.
   ASRBN, ///< Byte arithmetic shift right N bits.
   ASRWN, ///< Word arithmetic shift right N bits.
   ASRLO, ///< Lower 8-bit of word arithmetic shift right.
+  ASRW,  ///< Wide arithmetic shift right.
   ROR, ///< Bit rotate right.
   ROL, ///< Bit rotate left.
   LSLLOOP, ///< A loop of single logical shift left instructions.
@@ -186,6 +189,8 @@
 private:
   MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
+  MachineBasicBlock *insertWideShift(MachineInstr &MI,
+                                     MachineBasicBlock *BB) const;
   MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
   MachineBasicBlock *insertCopyR1(MachineInstr &MI, MachineBasicBlock *BB) const;
Index: llvm/lib/Target/AVR/AVRISelLowering.cpp
===================================================================
--- llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -88,6 +88,9 @@
   setOperationAction(ISD::SRA, MVT::i16, Custom);
   setOperationAction(ISD::SHL, MVT::i16, Custom);
   setOperationAction(ISD::SRL, MVT::i16, Custom);
+  setOperationAction(ISD::SRA, MVT::i32, Custom);
+  setOperationAction(ISD::SHL, MVT::i32, Custom);
+  setOperationAction(ISD::SRL, MVT::i32, Custom);
   setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
   setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
   setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
@@ -247,10 +250,13 @@
   NODE(CALL);
   NODE(WRAPPER);
   NODE(LSL);
+  NODE(LSLW);
   NODE(LSR);
+  NODE(LSRW);
   NODE(ROL);
   NODE(ROR);
   NODE(ASR);
+  NODE(ASRW);
   NODE(LSLLOOP);
   NODE(LSRLOOP);
   NODE(ROLLOOP);
@@ -279,6 +285,40 @@
   assert(isPowerOf2_32(VT.getSizeInBits()) &&
          "Expected power-of-2 shift amount");
 
+  if (VT.getSizeInBits() == 32) {
+    if (!isa<ConstantSDNode>(N->getOperand(1))) {
+      // Non-constant 32-bit shifts are converted to a loop in IR.
+      llvm_unreachable("Expected a constant shift!");
+    }
+    SDVTList ResTys = DAG.getVTList(MVT::i16, MVT::i16);
+    SDValue SrcLo =
+        DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
+                    DAG.getConstant(0, dl, MVT::i16));
+    SDValue SrcHi =
+        DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
+                    DAG.getConstant(1, dl, MVT::i16));
+    uint64_t ShiftAmount =
+        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    SDValue Cnt = DAG.getTargetConstant(ShiftAmount, dl, MVT::i8);
+    unsigned Opc;
+    switch (Op.getOpcode()) {
+    default:
+      llvm_unreachable("Invalid 32-bit shift opcode!");
+    case ISD::SHL:
+      Opc = AVRISD::LSLW;
+      break;
+    case ISD::SRL:
+      Opc = AVRISD::LSRW;
+      break;
+    case ISD::SRA:
+      Opc = AVRISD::ASRW;
+      break;
+    }
+    SDValue Result = DAG.getNode(Opc, dl, ResTys, SrcLo, SrcHi, Cnt);
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Result.getValue(0),
+                       Result.getValue(1));
+  }
+
   // Expand non-constant shifts to loops.
   if (!isa<ConstantSDNode>(N->getOperand(1))) {
     switch (Op.getOpcode()) {
@@ -1785,6 +1825,341 @@
   return RemBB;
 }
 
+// Do a multibyte AVR shift. Insert shift instructions and put the output
+// registers in the Regs array.
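+// The Regs array holds the input registers with the most significant byte at
+// index 0, and is updated in place with the output registers.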
+// Because AVR does not have a normal shift instruction (only single-bit shift
+// instructions), we have to emulate this behavior with other instructions.
+// The algorithm first tries large steps (moving whole registers around) and
+// then smaller steps, like single-bit shifts.
+// Large shifts actually reduce the number of registers that still need to be
+// shifted, so the algorithms below have to work independently of the number
+// of registers that are shifted.
+// For more information and background, see this blog post:
+// https://aykevl.nl/2021/02/avr-bitshift
+static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
+                                 MutableArrayRef<std::pair<Register, int>> Regs,
+                                 int64_t ShiftAmt, bool ArithmeticShift) {
+  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // Do a shift by 6 or 7 bits modulo 8. This is a bit more complicated than
+  // most shifts and is hard to compose with the rest, so these are special
+  // cased.
+  // The basic idea is to shift one or two bits in the opposite direction and
+  // then move registers around to get the correct end result.
+  if (ShiftAmt < 0 && (-ShiftAmt % 8) >= 6) {
+    // Left shift modulo 6 or 7.
+
+    // Create a slice of the registers we're going to modify, to ease working
+    // with them.
+    size_t ShiftRegsOffset = -ShiftAmt / 8;
+    size_t ShiftBytes = Regs.size() - ShiftRegsOffset;
+    MutableArrayRef<std::pair<Register, int>> ShiftRegs =
+        Regs.slice(ShiftRegsOffset, ShiftBytes);
+
+    // Shift one to the right, keeping the least significant bit as the carry
+    // bit.
+    insertMultibyteShift(MI, BB, ShiftRegs, 1, false);
+
+    // Create zero register.
+    Register Zero = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+    BuildMI(*BB, MI, dl, TII.get(AVR::COPY), Zero).addReg(AVR::R1);
+
+    // Rotate the least significant bit from the carry bit into a new register
+    // (that starts out zero).
+    Register LowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+    BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), LowByte).addReg(Zero);
+
+    // Shift one more to the right if this is a modulo-6 shift.
+    if (-ShiftAmt % 8 == 6) {
+      insertMultibyteShift(MI, BB, ShiftRegs, 1, false);
+      Register NewLowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), NewLowByte).addReg(LowByte);
+      LowByte = NewLowByte;
+    }
+
+    // Move all registers to the left, zeroing the bottom registers as needed.
+    for (size_t I = 0; I < Regs.size(); I++) {
+      int Idx = I + 1;
+      if (Idx < (int)ShiftRegs.size()) {
+        Regs[I] = ShiftRegs[Idx];
+      } else if (Idx == (int)ShiftRegs.size()) {
+        Regs[I] = std::pair(LowByte, 0);
+      } else {
+        Regs[I] = std::pair(Zero, 0);
+      }
+    }
+
+    return;
+  }
+
+  // Right shift modulo 6 or 7.
+  if (ShiftAmt > 0 && (ShiftAmt % 8) >= 6) {
+    // Create a view on the registers we're going to modify, to ease working
+    // with them.
+    size_t ShiftBytes = Regs.size() - (ShiftAmt / 8);
+    MutableArrayRef<std::pair<Register, int>> ShiftRegs =
+        Regs.slice(0, ShiftBytes);
+
+    // Shift one to the left.
+    insertMultibyteShift(MI, BB, ShiftRegs, -1, false);
+
+    // Sign or zero extend the most significant register into a new register.
+    Register Ext = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+    Register ExtMore = 0;
+    if (ArithmeticShift) {
+      // Sign-extend the bit that was shifted out last.
+      BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), Ext)
+          .addReg(Ext, RegState::Undef)
+          .addReg(Ext, RegState::Undef);
+      ExtMore = Ext;
+    } else {
+      // Create a new zero register for zero extending.
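+      // (R1 is used here because the AVR backend keeps the value zero in R1;
+      // see also the CopyR1 pseudo further down in this patch.)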
+ ExtMore = MRI.createVirtualRegister(&AVR::GPR8RegClass); + BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ExtMore).addReg(AVR::R1); + // Rotate most significant bit into a new register (that starts out zero). + BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), Ext) + .addReg(ExtMore) + .addReg(ExtMore); + } + + // Shift one more to the left for modulo 6 shifts. + if (ShiftAmt % 8 == 6) { + insertMultibyteShift(MI, BB, ShiftRegs, -1, false); + Register NewExt = MRI.createVirtualRegister(&AVR::GPR8RegClass); + BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), NewExt) + .addReg(Ext) + .addReg(Ext); + Ext = NewExt; + } + + // Move all to the right, while sign or zero extending. + for (int I = Regs.size() - 1; I >= 0; I--) { + int Idx = I - (Regs.size() - ShiftRegs.size()) - 1; + if (Idx >= 0) { + Regs[I] = ShiftRegs[Idx]; + } else if (Idx == -1) { + Regs[I] = std::pair(Ext, 0); + } else { + Regs[I] = std::pair(ExtMore, 0); + } + } + + return; + } + + // For shift amounts of at least one register, simply rename the registers and + // zero the bottom registers. + auto MSBReg = Regs[0]; + Register ShrExtendReg = 0; + while (ShiftAmt <= -8) { + // Move all registers one to the left. + for (size_t I = 0; I < Regs.size() - 1; I++) { + Regs[I] = Regs[I + 1]; + } + + // Zero the least significant register. + Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass); + BuildMI(*BB, MI, dl, TII.get(AVR::COPY), Out).addReg(AVR::R1); + Regs[Regs.size() - 1] = std::pair(Out, 0); + + // Continue shifts with the leftover registers. + Regs = Regs.slice(0, Regs.size() - 1); + + ShiftAmt += 8; + } + while (ShiftAmt >= 8) { + // Move all registers one to the right. + for (size_t I = Regs.size() - 1; I != 0; I--) { + Regs[I] = Regs[I - 1]; + } + + // Zero or sign extend the most significant register. + if (ShrExtendReg == 0) { + ShrExtendReg = MRI.createVirtualRegister(&AVR::GPR8RegClass); + if (ArithmeticShift) { + // Sign extend the most significant register into ShrExtendReg. + Register Tmp = MRI.createVirtualRegister(&AVR::GPR8RegClass); + BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Tmp) + .addReg(MSBReg.first, 0, MSBReg.second) + .addReg(MSBReg.first, 0, MSBReg.second); + BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), ShrExtendReg) + .addReg(Tmp) + .addReg(Tmp); + } else { + BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ShrExtendReg).addReg(AVR::R1); + } + } + Regs[0] = std::pair(ShrExtendReg, 0); + + // Continue shifts with the leftover registers. + Regs = Regs.slice(1, Regs.size() - 1); + + ShiftAmt -= 8; + } + + // Shift by four bits, using a complicated swap/eor/andi/eor sequence. + // It only works for logical shifts because the bits shifted in are all + // zeroes. + // Example shifting 16 bits (2 bytes): + // + // ; shift r1 + // swap r1 + // andi r1, 0xf0 + // ; shift r0 + // swap r0 + // eor r1, r0 + // andi r0, 0xf0 + // eor r1, r0 + if (!ArithmeticShift && (ShiftAmt <= -4 || ShiftAmt >= 4)) { + Register Prev = 0; + for (size_t i = 0; i < Regs.size(); i++) { + size_t Idx = (ShiftAmt < 0) ? 
i : Regs.size() - i - 1;
+      Register SwapReg = MRI.createVirtualRegister(&AVR::LD8RegClass);
+      BuildMI(*BB, MI, dl, TII.get(AVR::SWAPRd), SwapReg)
+          .addReg(Regs[Idx].first, 0, Regs[Idx].second);
+      if (Prev != 0) {
+        Register R = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+        BuildMI(*BB, MI, dl, TII.get(AVR::EORRdRr), R)
+            .addReg(Prev)
+            .addReg(SwapReg);
+        Prev = R;
+      }
+      Register AndReg = MRI.createVirtualRegister(&AVR::LD8RegClass);
+      BuildMI(*BB, MI, dl, TII.get(AVR::ANDIRdK), AndReg)
+          .addReg(SwapReg)
+          .addImm((ShiftAmt < 0) ? 0xf0 : 0x0f);
+      if (Prev != 0) {
+        Register R = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+        BuildMI(*BB, MI, dl, TII.get(AVR::EORRdRr), R)
+            .addReg(Prev)
+            .addReg(AndReg);
+        if (ShiftAmt < 0) { // left shift
+          Regs[Idx - 1] = std::pair(R, 0);
+        } else { // right shift
+          Regs[Idx + 1] = std::pair(R, 0);
+        }
+      }
+      Prev = AndReg;
+      Regs[Idx] = std::pair(AndReg, 0);
+    }
+    if (ShiftAmt < 0) {
+      ShiftAmt += 4;
+    } else {
+      ShiftAmt -= 4;
+    }
+  }
+
+  // Shift by one. This is the fallback that always works, and the shift
+  // operation that is used for 1-, 2- and 3-bit shifts.
+  while (ShiftAmt < 0) {
+    // Shift one to the left.
+    for (size_t i = 0; i < Regs.size(); i++) {
+      size_t Idx = Regs.size() - i - 1;
+      Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      Register In = Regs[Idx].first;
+      Register InSubreg = Regs[Idx].second;
+      if (i == 0) {
+        BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Out)
+            .addReg(In, 0, InSubreg)
+            .addReg(In, 0, InSubreg);
+      } else {
+        BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), Out)
+            .addReg(In, 0, InSubreg)
+            .addReg(In, 0, InSubreg);
+      }
+      Regs[Idx] = std::pair(Out, 0);
+    }
+    ShiftAmt++;
+  }
+  while (ShiftAmt > 0) {
+    // Shift one to the right.
+    for (size_t i = 0; i < Regs.size(); i++) {
+      Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      Register In = Regs[i].first;
+      Register InSubreg = Regs[i].second;
+      if (i == 0) {
+        unsigned Opc = ArithmeticShift ? AVR::ASRRd : AVR::LSRRd;
+        BuildMI(*BB, MI, dl, TII.get(Opc), Out).addReg(In, 0, InSubreg);
+      } else {
+        BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), Out).addReg(In, 0, InSubreg);
+      }
+      Regs[i] = std::pair(Out, 0);
+    }
+    ShiftAmt--;
+  }
+
+  if (ShiftAmt != 0) {
+    llvm_unreachable("don't know how to shift!"); // sanity check
+  }
+}
+
+// Do a wide (32-bit) shift.
+MachineBasicBlock *
+AVRTargetLowering::insertWideShift(MachineInstr &MI,
+                                   MachineBasicBlock *BB) const {
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // How much to shift to the right (a negative number indicates a left shift).
+  int64_t ShiftAmt = MI.getOperand(4).getImm();
+  bool ArithmeticShift = false;
+  switch (MI.getOpcode()) {
+  case AVR::Lsl32:
+    ShiftAmt = -ShiftAmt;
+    break;
+  case AVR::Asr32:
+    ArithmeticShift = true;
+    break;
+  }
+
+  // Read the input registers, with the most significant register at index 0.
+  SmallVector<std::pair<Register, int>, 4> Registers;
+  Registers.push_back(std::pair(MI.getOperand(3).getReg(), 1));
+  Registers.push_back(std::pair(MI.getOperand(3).getReg(), 2));
+  Registers.push_back(std::pair(MI.getOperand(2).getReg(), 1));
+  Registers.push_back(std::pair(MI.getOperand(2).getReg(), 2));
+
+  // Do the shift. The registers are modified in-place.
+  insertMultibyteShift(MI, BB, Registers, ShiftAmt, ArithmeticShift);
+
+  // Combine the 8-bit registers into 16-bit register pairs.
+  // For some reason, some right-shift instructions result in better register
+  // allocation with the sequence reversed.
+ // If we ever start splitting 16-bit pseudo instructions into 8-bit + // instructions before register allocation, this workaround probably becomes + // unnecessary. + if (MI.getOpcode() != AVR::Lsl32 && MI.getOperand(4).getImm() < 16) { + // Works better with a right shift where registers are moved once. + BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg()) + .addReg(Registers[1].first, 0, Registers[1].second) + .addImm(2) + .addReg(Registers[0].first, 0, Registers[0].second) + .addImm(1); + BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg()) + .addReg(Registers[3].first, 0, Registers[3].second) + .addImm(2) + .addReg(Registers[2].first, 0, Registers[2].second) + .addImm(1); + } else { + // Works better in some other cases. + BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg()) + .addReg(Registers[0].first, 0, Registers[0].second) + .addImm(1) + .addReg(Registers[1].first, 0, Registers[1].second) + .addImm(2); + BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg()) + .addReg(Registers[2].first, 0, Registers[2].second) + .addImm(1) + .addReg(Registers[3].first, 0, Registers[3].second) + .addImm(2); + } + + MI.eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + static bool isCopyMulResult(MachineBasicBlock::iterator const &I) { if (I->getOpcode() == AVR::COPY) { Register SrcReg = I->getOperand(1).getReg(); @@ -1897,6 +2272,10 @@ case AVR::Asr8: case AVR::Asr16: return insertShift(MI, MBB); + case AVR::Lsl32: + case AVR::Lsr32: + case AVR::Asr32: + return insertWideShift(MI, MBB); case AVR::MULRdRr: case AVR::MULSRdRr: return insertMul(MI, MBB); Index: llvm/lib/Target/AVR/AVRInstrInfo.td =================================================================== --- llvm/lib/Target/AVR/AVRInstrInfo.td +++ llvm/lib/Target/AVR/AVRInstrInfo.td @@ -69,6 +69,9 @@ def AVRlslwn : SDNode<"AVRISD::LSLWN", SDTIntBinOp>; def AVRlsrwn : SDNode<"AVRISD::LSRWN", SDTIntBinOp>; def AVRasrwn : SDNode<"AVRISD::ASRWN", SDTIntBinOp>; +def AVRlslw : SDNode<"AVRISD::LSLW", SDTIntShiftDOp>; +def AVRlsrw : SDNode<"AVRISD::LSRW", SDTIntShiftDOp>; +def AVRasrw : SDNode<"AVRISD::ASRW", SDTIntShiftDOp>; // Pseudo shift nodes for non-constant shift amounts. def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>; @@ -2309,6 +2312,11 @@ : $src, i8 : $cnt))]>; +def Lsl32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi), + (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt), + "# Lsl32 PSEUDO", + [(set i16:$dstlo, i16:$dsthi, (AVRlslw i16:$srclo, i16:$srchi, i8:$cnt))]>; + def Lsr8 : ShiftPseudo<(outs GPR8 : $dst), (ins GPR8 @@ -2329,6 +2337,11 @@ : $src, i8 : $cnt))]>; +def Lsr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi), + (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt), + "# Lsr32 PSEUDO", + [(set i16:$dstlo, i16:$dsthi, (AVRlsrw i16:$srclo, i16:$srchi, i8:$cnt))]>; + def Rol8 : ShiftPseudo<(outs GPR8 : $dst), (ins GPR8 @@ -2389,6 +2402,11 @@ : $src, i8 : $cnt))]>; +def Asr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi), + (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt), + "# Asr32 PSEUDO", + [(set i16:$dstlo, i16:$dsthi, (AVRasrw i16:$srclo, i16:$srchi, i8:$cnt))]>; + // lowered to a copy from R1, which contains the value zero. 
let usesCustomInserter=1 in def CopyR1 : Pseudo<(outs GPR8:$rd), (ins), "clrz\t$rd", [(set i8:$rd, 0)]>; Index: llvm/test/CodeGen/AVR/avr-rust-issue-123.ll =================================================================== --- llvm/test/CodeGen/AVR/avr-rust-issue-123.ll +++ llvm/test/CodeGen/AVR/avr-rust-issue-123.ll @@ -46,10 +46,10 @@ store i8 %tmp3, i8* getelementptr inbounds (%UInt8, %UInt8* @delayFactor, i64 0, i32 0), align 1 %tmp4 = zext i8 %tmp3 to i32 %tmp5 = mul nuw nsw i32 %tmp4, 100 - ; CHECK: sts delay+3, r{{[0-9]+}} - ; CHECK-NEXT: sts delay+2, r{{[0-9]+}} - ; CHECK-NEXT: sts delay+1, r{{[0-9]+}} + ; CHECK: sts delay+1, r{{[0-9]+}} ; CHECK-NEXT: sts delay, r{{[0-9]+}} + ; CHECK-NEXT: sts delay+3, r{{[0-9]+}} + ; CHECK-NEXT: sts delay+2, r{{[0-9]+}} store i32 %tmp5, i32* getelementptr inbounds (%UInt32, %UInt32* @delay, i64 0, i32 0), align 4 tail call void @eeprom_write(i16 34, i8 %tmp3) br label %bb7 Index: llvm/test/CodeGen/AVR/return.ll =================================================================== --- llvm/test/CodeGen/AVR/return.ll +++ llvm/test/CodeGen/AVR/return.ll @@ -274,10 +274,10 @@ ; AVR-NEXT: push r29 ; AVR-NEXT: in r28, 61 ; AVR-NEXT: in r29, 62 -; AVR-NEXT: ldd r22, Y+5 -; AVR-NEXT: ldd r23, Y+6 ; AVR-NEXT: ldd r24, Y+7 ; AVR-NEXT: ldd r25, Y+8 +; AVR-NEXT: ldd r22, Y+5 +; AVR-NEXT: ldd r23, Y+6 ; AVR-NEXT: pop r29 ; AVR-NEXT: pop r28 ; AVR-NEXT: ret @@ -288,10 +288,10 @@ ; TINY-NEXT: push r29 ; TINY-NEXT: in r28, 61 ; TINY-NEXT: in r29, 62 -; TINY-NEXT: ldd r22, Y+13 -; TINY-NEXT: ldd r23, Y+14 ; TINY-NEXT: ldd r24, Y+15 ; TINY-NEXT: ldd r25, Y+16 +; TINY-NEXT: ldd r22, Y+13 +; TINY-NEXT: ldd r23, Y+14 ; TINY-NEXT: pop r29 ; TINY-NEXT: pop r28 ; TINY-NEXT: ret Index: llvm/test/CodeGen/AVR/shift32.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AVR/shift32.ll @@ -0,0 +1,422 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=avr -mattr=movw -verify-machineinstrs | FileCheck %s + +; Lowering of constant 32-bit shift instructions. +; The main reason these functions are tested separate from shift.ll is because +; of update_llc_test_checks.py. 
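+;
+; Note that only constant shift amounts are tested: non-constant 32-bit shifts
+; do not reach the new LSLW/LSRW/ASRW lowering because they are converted to a
+; loop in IR (as noted in LowerShift), so every test below uses an immediate
+; shift amount.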
+ +define i32 @shl_i32_1(i32 %a) { +; CHECK-LABEL: shl_i32_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: ret + %res = shl i32 %a, 1 + ret i32 %res +} + +define i32 @shl_i32_2(i32 %a) { +; CHECK-LABEL: shl_i32_2: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: ret + %res = shl i32 %a, 2 + ret i32 %res +} + +define i32 @shl_i32_4(i32 %a) { +; CHECK-LABEL: shl_i32_4: +; CHECK: ; %bb.0: +; CHECK-NEXT: swap r25 +; CHECK-NEXT: andi r25, 240 +; CHECK-NEXT: swap r24 +; CHECK-NEXT: eor r25, r24 +; CHECK-NEXT: andi r24, 240 +; CHECK-NEXT: eor r25, r24 +; CHECK-NEXT: swap r23 +; CHECK-NEXT: eor r24, r23 +; CHECK-NEXT: andi r23, 240 +; CHECK-NEXT: eor r24, r23 +; CHECK-NEXT: swap r22 +; CHECK-NEXT: eor r23, r22 +; CHECK-NEXT: andi r22, 240 +; CHECK-NEXT: eor r23, r22 +; CHECK-NEXT: ret + %res = shl i32 %a, 4 + ret i32 %res +} + +define i32 @shl_i32_5(i32 %a) { +; CHECK-LABEL: shl_i32_5: +; CHECK: ; %bb.0: +; CHECK-NEXT: swap r25 +; CHECK-NEXT: andi r25, 240 +; CHECK-NEXT: swap r24 +; CHECK-NEXT: eor r25, r24 +; CHECK-NEXT: andi r24, 240 +; CHECK-NEXT: eor r25, r24 +; CHECK-NEXT: swap r23 +; CHECK-NEXT: eor r24, r23 +; CHECK-NEXT: andi r23, 240 +; CHECK-NEXT: eor r24, r23 +; CHECK-NEXT: swap r22 +; CHECK-NEXT: eor r23, r22 +; CHECK-NEXT: andi r22, 240 +; CHECK-NEXT: eor r23, r22 +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: ret + %res = shl i32 %a, 5 + ret i32 %res +} + +define i32 @shl_i32_6(i32 %a) { +; CHECK-LABEL: shl_i32_6: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: mov r18, r1 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: mov r19, r22 +; CHECK-NEXT: movw r22, r18 +; CHECK-NEXT: ret + %res = shl i32 %a, 6 + ret i32 %res +} + + +define i32 @shl_i32_7(i32 %a) { +; CHECK-LABEL: shl_i32_7: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: mov r18, r1 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: mov r19, r22 +; CHECK-NEXT: movw r22, r18 +; CHECK-NEXT: ret + %res = shl i32 %a, 7 + ret i32 %res +} + +define i32 @shl_i32_8(i32 %a) { +; CHECK-LABEL: shl_i32_8: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: mov r23, r22 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: ret + %res = shl i32 %a, 8 + ret i32 %res +} + +define i32 @shl_i32_15(i32 %a) { +; CHECK-LABEL: shl_i32_15: +; CHECK: ; %bb.0: +; CHECK-NEXT: movw r18, r22 +; CHECK-NEXT: lsr r24 +; CHECK-NEXT: ror r19 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: mov r23, r1 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = shl i32 %a, 15 + ret i32 %res +} + +; Combined with the register allocator, shift instructions can sometimes be +; optimized away entirely. The least significant registers are simply stored +; directly instead of moving them first. 
+; TODO: the `mov Rd, r1` instructions are needed because most of the +; instructions are 16-bits and instructions are only split after register +; allocation. These two instructions could be avoided if the 16-bit store +; instruction was split into two 8-bit store instructions before register +; allocation. That would make this shift a no-op. +define void @shl_i32_16_ptr(i32 %a, ptr %ptr) { +; CHECK-LABEL: shl_i32_16_ptr: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov r25, r1 +; CHECK-NEXT: mov r24, r1 +; CHECK-NEXT: movw r30, r20 +; CHECK-NEXT: std Z+2, r22 +; CHECK-NEXT: std Z+3, r23 +; CHECK-NEXT: st Z, r24 +; CHECK-NEXT: std Z+1, r25 +; CHECK-NEXT: ret + %res = shl i32 %a, 16 + store i32 %res, ptr %ptr + ret void +} + +define i32 @shl_i32_28(i32 %a) { +; CHECK-LABEL: shl_i32_28: +; CHECK: ; %bb.0: +; CHECK-NEXT: swap r22 +; CHECK-NEXT: andi r22, 240 +; CHECK-NEXT: mov r25, r22 +; CHECK-NEXT: mov r24, r1 +; CHECK-NEXT: mov r23, r1 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: ret + %res = shl i32 %a, 28 + ret i32 %res +} + +define i32 @shl_i32_31(i32 %a) { +; CHECK-LABEL: shl_i32_31: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r22 +; CHECK-NEXT: mov r25, r1 +; CHECK-NEXT: ror r25 +; CHECK-NEXT: mov r24, r1 +; CHECK-NEXT: mov r23, r1 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: ret + %res = shl i32 %a, 31 + ret i32 %res +} + +define i32 @lshr_i32_1(i32 %a) { +; CHECK-LABEL: lshr_i32_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: ret + %res = lshr i32 %a, 1 + ret i32 %res +} + +define i32 @lshr_i32_2(i32 %a) { +; CHECK-LABEL: lshr_i32_2: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: ret + %res = lshr i32 %a, 2 + ret i32 %res +} + +define i32 @lshr_i32_4(i32 %a) { +; CHECK-LABEL: lshr_i32_4: +; CHECK: ; %bb.0: +; CHECK-NEXT: swap r22 +; CHECK-NEXT: andi r22, 15 +; CHECK-NEXT: swap r23 +; CHECK-NEXT: eor r22, r23 +; CHECK-NEXT: andi r23, 15 +; CHECK-NEXT: eor r22, r23 +; CHECK-NEXT: swap r24 +; CHECK-NEXT: eor r23, r24 +; CHECK-NEXT: andi r24, 15 +; CHECK-NEXT: eor r23, r24 +; CHECK-NEXT: swap r25 +; CHECK-NEXT: eor r24, r25 +; CHECK-NEXT: andi r25, 15 +; CHECK-NEXT: eor r24, r25 +; CHECK-NEXT: ret + %res = lshr i32 %a, 4 + ret i32 %res +} + +; TODO: this could be optimized to 4 movs, instead of five. 
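+; (The detour through r18:r19 and the final movw are likely a consequence of
+; building the result as 16-bit REG_SEQUENCE pairs in insertWideShift before
+; register allocation.)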
+define i32 @lshr_i32_8(i32 %a) { +; CHECK-LABEL: lshr_i32_8: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov r18, r25 +; CHECK-NEXT: mov r19, r1 +; CHECK-NEXT: mov r22, r23 +; CHECK-NEXT: mov r23, r24 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = lshr i32 %a, 8 + ret i32 %res +} + +define i32 @lshr_i32_31(i32 %a) { +; CHECK-LABEL: lshr_i32_31: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: rol r22 +; CHECK-NEXT: mov r25, r1 +; CHECK-NEXT: mov r24, r1 +; CHECK-NEXT: mov r23, r1 +; CHECK-NEXT: ret + %res = lshr i32 %a, 31 + ret i32 %res +} + +define i32 @ashr_i32_1(i32 %a) { +; CHECK-LABEL: ashr_i32_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: asr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: ret + %res = ashr i32 %a, 1 + ret i32 %res +} + +define i32 @ashr_i32_2(i32 %a) { +; CHECK-LABEL: ashr_i32_2: +; CHECK: ; %bb.0: +; CHECK-NEXT: asr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: asr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: ret + %res = ashr i32 %a, 2 + ret i32 %res +} + +; can't use the swap/andi/eor trick here +define i32 @ashr_i32_4(i32 %a) { +; CHECK-LABEL: ashr_i32_4: +; CHECK: ; %bb.0: +; CHECK-NEXT: asr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: asr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: asr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: asr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: ret + %res = ashr i32 %a, 4 + ret i32 %res +} + +define i32 @ashr_i32_7(i32 %a) { +; CHECK-LABEL: ashr_i32_7: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: mov r18, r25 +; CHECK-NEXT: mov r22, r23 +; CHECK-NEXT: mov r23, r24 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 7 + ret i32 %res +} + +define i32 @ashr_i32_8(i32 %a) { +; CHECK-LABEL: ashr_i32_8: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov r19, r25 +; CHECK-NEXT: lsl r19 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: mov r18, r25 +; CHECK-NEXT: mov r22, r23 +; CHECK-NEXT: mov r23, r24 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 8 + ret i32 %res +} + +define i32 @ashr_i32_16(i32 %a) { +; CHECK-LABEL: ashr_i32_16: +; CHECK: ; %bb.0: +; CHECK-NEXT: movw r22, r24 +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: sbc r25, r25 +; CHECK-NEXT: mov r24, r25 +; CHECK-NEXT: ret + %res = ashr i32 %a, 16 + ret i32 %res +} + +define i32 @ashr_i32_23(i32 %a) { +; CHECK-LABEL: ashr_i32_23: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: mov r18, r19 +; CHECK-NEXT: mov r23, r19 +; CHECK-NEXT: mov r22, r25 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 23 + ret i32 %res +} + +define i32 @ashr_i32_30(i32 %a) { +; CHECK-LABEL: ashr_i32_30: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: mov r22, r19 +; CHECK-NEXT: rol r22 +; CHECK-NEXT: mov r18, r19 +; CHECK-NEXT: mov r23, r19 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 30 + ret i32 %res +} + +define i32 @ashr_i32_31(i32 %a) { +; CHECK-LABEL: ashr_i32_31: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: sbc r23, r23 +; CHECK-NEXT: mov r22, r23 +; CHECK-NEXT: movw r24, r22 
+; CHECK-NEXT: ret + %res = ashr i32 %a, 31 + ret i32 %res +}
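
Reviewer note, not part of the patch: the swap/andi/eor sequence that insertMultibyteShift emits for 4-bit logical shifts is easier to follow on plain bytes than on MachineInstr builder calls. The host-side C++ sketch below is illustrative only; the function and variable names are invented and nothing in it is emitted by the backend. It performs the same 4-bit logical left shift on a most-significant-byte-first array using one nibble swap, one mask, and one combine per byte (the counterparts of SWAP, ANDI and EOR), and it shows why the trick is limited to logical shifts: the bits brought into the freed low nibble must be zero.

#include <cstdint>
#include <cstdio>

// Shift an N-byte value (stored most significant byte first) left by 4 bits,
// byte by byte, mirroring the SWAP/ANDI/EOR steps in insertMultibyteShift.
static void shiftLeftBy4(uint8_t *Bytes, int N) {
  uint8_t Carry = 0; // low nibble carried up from the less significant byte
  for (int I = N - 1; I >= 0; --I) {
    uint8_t Swapped = (uint8_t)((Bytes[I] << 4) | (Bytes[I] >> 4)); // SWAP
    // ANDI keeps the nibble that stays in this byte; the combine is written
    // as OR here, which matches the patch's EOR because the two nibbles
    // never overlap.
    Bytes[I] = (uint8_t)((Swapped & 0xf0) | Carry);
    Carry = Swapped & 0x0f; // nibble that moves up into the next byte
  }
  // Carry now holds the bits shifted out of the most significant byte.
}

int main() {
  uint8_t V[4] = {0x12, 0x34, 0x56, 0x78}; // 0x12345678, most significant first
  shiftLeftBy4(V, 4);
  printf("%02x%02x%02x%02x\n", V[0], V[1], V[2], V[3]); // prints 23456780
  return 0;
}

The right-shift form of the trick works the same way with the 0x0f mask, as the ANDIRdK immediate in the patch shows; arithmetic shifts fall back to single-bit steps because the incoming bits would have to be copies of the sign bit rather than zero.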