diff --git a/llvm/lib/Target/AVR/AVR.h b/llvm/lib/Target/AVR/AVR.h
--- a/llvm/lib/Target/AVR/AVR.h
+++ b/llvm/lib/Target/AVR/AVR.h
@@ -22,6 +22,7 @@
 class AVRTargetMachine;
 class FunctionPass;
 
+Pass *createAVRShiftExpandPass();
 FunctionPass *createAVRISelDag(AVRTargetMachine &TM,
                                CodeGenOpt::Level OptLevel);
 FunctionPass *createAVRExpandPseudoPass();
@@ -30,6 +31,7 @@
 FunctionPass *createAVRDynAllocaSRPass();
 FunctionPass *createAVRBranchSelectionPass();
 
+void initializeAVRShiftExpandPass(PassRegistry &);
 void initializeAVRExpandPseudoPass(PassRegistry&);
 void initializeAVRRelaxMemPass(PassRegistry&);
 
diff --git a/llvm/lib/Target/AVR/AVRShiftExpand.cpp b/llvm/lib/Target/AVR/AVRShiftExpand.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AVR/AVRShiftExpand.cpp
@@ -0,0 +1,155 @@
+//===- AVRShiftExpand.cpp - Shift Expansion Pass --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Expand 32-bit shift instructions (shl, lshr, ashr) to inline loops, just
+/// like avr-gcc. This must be done in IR because otherwise the type legalizer
+/// will turn 32-bit shifts into (non-existing) library calls such as __ashlsi3.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Function pass that replaces every i32 shift whose shift amount is not a
+/// ConstantInt with a loop that shifts by one bit per iteration.
+class AVRShiftExpand : public FunctionPass {
+public:
+  static char ID;
+
+  AVRShiftExpand() : FunctionPass(ID) {}
+
+  /// Collect the eligible shifts in \p F and expand each of them.
+  /// \returns true if at least one shift was expanded.
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override { return "AVR Shift Expansion"; }
+
+private:
+  /// Replace the i32 shift \p BI with a loop that shifts one bit per
+  /// iteration, then erase \p BI.
+  void expand(BinaryOperator *BI);
+};
+
+} // end of anonymous namespace
+
+char AVRShiftExpand::ID = 0;
+
+INITIALIZE_PASS(AVRShiftExpand, "avr-shift-expand", "AVR Shift Expansion",
+                false, false)
+
+Pass *llvm::createAVRShiftExpandPass() { return new AVRShiftExpand(); }
+
+bool AVRShiftExpand::runOnFunction(Function &F) {
+  SmallVector<BinaryOperator *, 1> ShiftInsts;
+  for (Instruction &I : instructions(F)) {
+    if (!I.isShift())
+      // Only expand shift instructions (shl, lshr, ashr).
+      continue;
+    if (!I.getType()->isIntegerTy(32))
+      // Only expand plain i32 types.
+      continue;
+    if (isa<ConstantInt>(I.getOperand(1)))
+      // Only expand when the shift amount is not known.
+      // Known shift amounts are (currently) better expanded inline.
+      continue;
+    ShiftInsts.push_back(cast<BinaryOperator>(&I));
+  }
+
+  // The expanding itself needs to be done separately as expand() will remove
+  // these instructions. Removing instructions while iterating over a basic
+  // block is not a great idea.
+  for (BinaryOperator *BI : ShiftInsts)
+    expand(BI);
+
+  // Return whether this function expanded any shift instructions.
+  return !ShiftInsts.empty();
+}
+
+void AVRShiftExpand::expand(BinaryOperator *BI) {
+  auto &Ctx = BI->getContext();
+  IRBuilder<> Builder(BI);
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+  Type *Int8Ty = Type::getInt8Ty(Ctx);
+  Value *Int8Zero = ConstantInt::get(Int8Ty, 0);
+
+  // Split the current basic block at the point of the existing shift
+  // instruction and insert a new basic block for the loop.
+  BasicBlock *BB = BI->getParent();
+  Function *F = BB->getParent();
+  BasicBlock *EndBB = BB->splitBasicBlock(BI, "shift.done");
+  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "shift.loop", F, EndBB);
+
+  // Truncate the shift amount to i8, which is trivially lowered to a single
+  // AVR register. The truncation is inserted in front of the unconditional
+  // branch that splitBasicBlock created. Anchoring on that branch rather than
+  // on the truncation itself matters: IRBuilder may fold the truncation of a
+  // constant operand (e.g. undef) into a Constant, which has no position in
+  // the block.
+  Builder.SetInsertPoint(&BB->back());
+  Value *ShiftAmount = Builder.CreateTrunc(BI->getOperand(1), Int8Ty);
+
+  // Replace the unconditional branch that splitBasicBlock created with a
+  // conditional branch that skips the loop when the shift amount is zero.
+  Value *Cmp1 = Builder.CreateICmpEQ(ShiftAmount, Int8Zero);
+  Builder.CreateCondBr(Cmp1, EndBB, LoopBB);
+  BB->back().eraseFromParent();
+
+  // Create the loop body starting with PHI nodes.
+  Builder.SetInsertPoint(LoopBB);
+  PHINode *ShiftAmountPHI = Builder.CreatePHI(Int8Ty, 2);
+  ShiftAmountPHI->addIncoming(ShiftAmount, BB);
+  PHINode *ValuePHI = Builder.CreatePHI(Int32Ty, 2);
+  ValuePHI->addIncoming(BI->getOperand(0), BB);
+
+  // Decrement the remaining shift amount by one, as exactly one bit is
+  // shifted per loop iteration.
+  Value *ShiftAmountSub =
+      Builder.CreateSub(ShiftAmountPHI, ConstantInt::get(Int8Ty, 1));
+  ShiftAmountPHI->addIncoming(ShiftAmountSub, LoopBB);
+
+  // Emit the actual shift instruction. The difference is that this shift
+  // instruction has a constant shift amount, which can be emitted inline
+  // without a library call.
+  Value *ValueShifted;
+  switch (BI->getOpcode()) {
+  case Instruction::Shl:
+    ValueShifted = Builder.CreateShl(ValuePHI, ConstantInt::get(Int32Ty, 1));
+    break;
+  case Instruction::LShr:
+    ValueShifted = Builder.CreateLShr(ValuePHI, ConstantInt::get(Int32Ty, 1));
+    break;
+  case Instruction::AShr:
+    ValueShifted = Builder.CreateAShr(ValuePHI, ConstantInt::get(Int32Ty, 1));
+    break;
+  default:
+    llvm_unreachable("asked to expand an instruction that is not a shift");
+  }
+  ValuePHI->addIncoming(ValueShifted, LoopBB);
+
+  // Branch to either the loop again (if there is more to shift) or to the
+  // basic block after the loop (if all bits are shifted).
+  Value *Cmp2 = Builder.CreateICmpEQ(ShiftAmountSub, Int8Zero);
+  Builder.CreateCondBr(Cmp2, EndBB, LoopBB);
+
+  // Collect the resulting value. This is necessary in the IR but won't produce
+  // any actual instructions.
+  Builder.SetInsertPoint(BI);
+  PHINode *Result = Builder.CreatePHI(Int32Ty, 2);
+  Result->addIncoming(BI->getOperand(0), BB);
+  Result->addIncoming(ValueShifted, LoopBB);
+
+  // Replace the original shift instruction.
+  BI->replaceAllUsesWith(Result);
+  BI->eraseFromParent();
+}
diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
--- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -65,6 +65,7 @@
     return getTM<AVRTargetMachine>();
   }
 
+  void addIRPasses() override;
   bool addInstSelector() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
@@ -76,6 +77,15 @@
   return new AVRPassConfig(*this, PM);
 }
 
+void AVRPassConfig::addIRPasses() {
+  // Expand instructions like
+  //   %result = shl i32 %n, %amount
+  // to a loop so that library calls are avoided.
+  addPass(createAVRShiftExpandPass());
+
+  TargetPassConfig::addIRPasses();
+}
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() {
   // Register the target.
   RegisterTargetMachine<AVRTargetMachine> X(getTheAVRTarget());
@@ -83,6 +93,7 @@
   auto &PR = *PassRegistry::getPassRegistry();
   initializeAVRExpandPseudoPass(PR);
   initializeAVRRelaxMemPass(PR);
+  initializeAVRShiftExpandPass(PR);
 }
 
 const AVRSubtarget *AVRTargetMachine::getSubtargetImpl() const {
diff --git a/llvm/lib/Target/AVR/CMakeLists.txt b/llvm/lib/Target/AVR/CMakeLists.txt
--- a/llvm/lib/Target/AVR/CMakeLists.txt
+++ b/llvm/lib/Target/AVR/CMakeLists.txt
@@ -24,6 +24,7 @@
   AVRMCInstLower.cpp
   AVRRelaxMemOperations.cpp
   AVRRegisterInfo.cpp
+  AVRShiftExpand.cpp
   AVRSubtarget.cpp
   AVRTargetMachine.cpp
   AVRTargetObjectFile.cpp
diff --git a/llvm/test/CodeGen/AVR/shift-expand.ll b/llvm/test/CodeGen/AVR/shift-expand.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/shift-expand.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -avr-shift-expand -S %s -o - | FileCheck %s
+
+; The avr-shift-expand pass expands large shifts with a non-constant shift
+; amount to a loop. These loops avoid generating a (non-existing) builtin such
+; as __ashlsi3.
+ +target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8" +target triple = "avr" + +define i32 @shl(i32 %value, i32 %amount) addrspace(1) { +; CHECK-LABEL: @shl( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[AMOUNT:%.*]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[SHIFT_DONE:%.*]], label [[SHIFT_LOOP:%.*]] +; CHECK: shift.loop: +; CHECK-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[SHIFT_LOOP]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[VALUE:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[SHIFT_LOOP]] ] +; CHECK-NEXT: [[TMP5]] = sub i8 [[TMP3]], 1 +; CHECK-NEXT: [[TMP6]] = shl i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[SHIFT_DONE]], label [[SHIFT_LOOP]] +; CHECK: shift.done: +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[VALUE]], [[TMP0]] ], [ [[TMP6]], [[SHIFT_LOOP]] ] +; CHECK-NEXT: ret i32 [[TMP8]] +; + %result = shl i32 %value, %amount + ret i32 %result +} + +define i32 @lshr(i32 %value, i32 %amount) addrspace(1) { +; CHECK-LABEL: @lshr( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[AMOUNT:%.*]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[SHIFT_DONE:%.*]], label [[SHIFT_LOOP:%.*]] +; CHECK: shift.loop: +; CHECK-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[SHIFT_LOOP]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[VALUE:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[SHIFT_LOOP]] ] +; CHECK-NEXT: [[TMP5]] = sub i8 [[TMP3]], 1 +; CHECK-NEXT: [[TMP6]] = lshr i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[SHIFT_DONE]], label [[SHIFT_LOOP]] +; CHECK: shift.done: +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[VALUE]], [[TMP0]] ], [ [[TMP6]], [[SHIFT_LOOP]] ] +; CHECK-NEXT: ret i32 [[TMP8]] +; + %result = lshr i32 %value, %amount + ret i32 %result +} + +define i32 @ashr(i32 %0, i32 %1) addrspace(1) {
+; CHECK-LABEL: @ashr( +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP1:%.*]] to i8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i8 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[SHIFT_DONE:%.*]], label [[SHIFT_LOOP:%.*]] +; CHECK: shift.loop: +; CHECK-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP3]], [[TMP2:%.*]] ], [ [[TMP7:%.*]], [[SHIFT_LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[TMP0:%.*]], [[TMP2]] ], [ [[TMP8:%.*]], [[SHIFT_LOOP]] ] +; CHECK-NEXT: [[TMP7]] = sub i8 [[TMP5]], 1 +; CHECK-NEXT: [[TMP8]] = ashr i32 [[TMP6]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label [[SHIFT_DONE]], label [[SHIFT_LOOP]] +; CHECK: shift.done: +; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP0]], [[TMP2]] ], [ [[TMP8]], [[SHIFT_LOOP]] ] +; CHECK-NEXT: ret i32 [[TMP10]] +; + %3 = ashr i32 %0, %1 + ret i32 %3 +} + +; This function is not modified because it is not an i32. +define i40 @shl40(i40 %value, i40 %amount) addrspace(1) { +; CHECK-LABEL: @shl40( +; CHECK-NEXT: [[RESULT:%.*]] = shl i40 [[VALUE:%.*]], [[AMOUNT:%.*]] +; CHECK-NEXT: ret i40 [[RESULT]] +; + %result = shl i40 %value, %amount + ret i40 %result +} + +; This function isn't modified either, although perhaps it should be. +define i24 @shl24(i24 %value, i24 %amount) addrspace(1) { +; CHECK-LABEL: @shl24( +; CHECK-NEXT: [[RESULT:%.*]] = shl i24 [[VALUE:%.*]], [[AMOUNT:%.*]] +; CHECK-NEXT: ret i24 [[RESULT]] +; + %result = shl i24 %value, %amount + ret i24 %result +}