diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -977,7 +977,8 @@
   /// \p AddressSpace is address space of the pointer.
   /// \p UseMaskForCond indicates if the memory access is predicated.
   /// \p UseMaskForGaps indicates if gaps should be masked.
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
                                  bool UseMaskForCond = false,
@@ -1341,7 +1342,8 @@
   virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
                                      bool VariableMask,
                                      unsigned Alignment) = 0;
-  virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  virtual int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                         unsigned Opcode, Type *VecTy,
                                          unsigned Factor,
                                          ArrayRef<unsigned> Indices,
                                          unsigned Alignment,
@@ -1763,11 +1765,12 @@
     return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                        Alignment);
   }
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace, bool UseMaskForCond,
                                  bool UseMaskForGaps) override {
-    return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return Impl.getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace,
                                            UseMaskForCond, UseMaskForGaps);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -481,7 +481,8 @@
     return 1;
   }
 
-  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                      unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
                                       unsigned Alignment, unsigned AddressSpace,
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -464,6 +464,14 @@
 Constant *createSequentialMask(IRBuilderBase &Builder, unsigned Start,
                                unsigned NumInts, unsigned NumUndefs);
 
+/// Create a TBL1 byte mask whose unused bytes default to 0xFF.
+///
+/// This function creates a TBL1 mask whose elements default to 0xff; an
+/// out-of-range index makes the TBL1 instruction write zero to the
+/// corresponding byte of the output vector.
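+///
+/// For example (assuming a little-endian element layout), InputMask = {0, 2},
+/// NumElts = 2, InputEltSize = 16 and OutputEltSize = 64 yield the byte mask
+///   <0, 1, 0xff x 6, 4, 5, 0xff x 6>,
+/// i.e. each selected i16 element is zero-extended into its own 64-bit output
+/// lane.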
+Constant *createTbl1Mask(IRBuilderBase &Builder,
+                         SmallVector<int, 16> &InputMask, unsigned NumElts,
+                         unsigned InputEltSize, unsigned OutputEltSize);
+
 /// Concatenate a list of vectors.
 ///
 /// This function generates code that concatenate the vectors in \p Vecs into a
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -914,7 +914,8 @@
     return Cost;
   }
 
-  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                      unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
                                       unsigned Alignment, unsigned AddressSpace,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2562,6 +2562,14 @@
     return false;
   }
 
+  /// Lower a shufflevector to target-specific intrinsics. Return
+  /// true on success.
+  ///
+  /// \p SI is the shufflevector instruction to be lowered.
+  virtual bool lowerShuffleVector(ShuffleVectorInst *SI) const {
+    return false;
+  }
+
   /// Return true if zero-extending the specific node Val to type VT2 is free
   /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
   /// because it's folded such as X86 zero-extending loads).
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -570,6 +570,9 @@
 def int_aarch64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic;
 
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
+  class AdvSIMD_Tbl1_temp_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_v16i8_ty],
+                [IntrNoMem]>;
   class AdvSIMD_Tbl1_Intrinsic
     : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>],
                 [IntrNoMem]>;
@@ -611,6 +614,7 @@
 def int_aarch64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic;
 def int_aarch64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic;
 def int_aarch64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic;
+def int_aarch64_neon_tbl1_temp : AdvSIMD_Tbl1_temp_Intrinsic;
 
 def int_aarch64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic;
 def int_aarch64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -681,11 +681,12 @@
   return Cost;
 }
 
-int TargetTransformInfo::getInterleavedMemoryOpCost(
+int TargetTransformInfo::getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
     bool UseMaskForGaps) const {
-  int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+  int Cost = TTIImpl->getInterleavedMemoryOpCost(I, VF,
+                                                 Opcode, VecTy, Factor, Indices,
                                                  Alignment, AddressSpace,
                                                  UseMaskForCond,
                                                  UseMaskForGaps);
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -726,6 +726,48 @@
   return ConstantVector::get(Mask);
 }
 
+Constant *llvm::createTbl1Mask(IRBuilderBase &Builder,
+                               SmallVector<int, 16> &InputMask,
+                               unsigned NumElts, unsigned InputEltSize,
+                               unsigned OutputEltSize) {
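+  // Build the 16 mask bytes one at a time: bytes that fall inside the window
+  // of the current input element copy the corresponding byte of the selected
+  // source element; every other byte gets 0xff so that TBL1 writes zero there.
+  // The byte offsets below assume a little-endian element layout.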
+  unsigned InputEltIdx = 0;
+  unsigned CurrInputIdx = 0;
+  unsigned CurrOffset;
+  unsigned OffsetLeft = 0;
+  unsigned OffsetRight = InputEltSize;
+
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < 16; i++) {
+    // If all input elements have already been placed in the output vector,
+    // fill the remaining bytes with an out-of-range index.
+    if (InputEltIdx >= NumElts)
+      Mask.push_back(Builder.getInt8(255));
+    else {
+      CurrOffset = i * 8;
+      if (CurrOffset >= OffsetLeft && CurrOffset < OffsetRight) {
+        CurrInputIdx = InputMask[InputEltIdx] * InputEltSize / 8 +
+                       (CurrOffset - OffsetLeft) / 8;
+        Mask.push_back(Builder.getInt8(CurrInputIdx));
+      } else if (CurrOffset == OffsetRight) {
+        // Finished one input element; move on to the next one.
+        InputEltIdx++;
+        if (InputEltIdx >= NumElts) {
+          Mask.push_back(Builder.getInt8(255));
+          continue;
+        }
+        OffsetLeft = OutputEltSize * InputEltIdx;
+        OffsetRight = OffsetLeft + InputEltSize;
+        // Check whether this byte already belongs to the new element.
+        if (CurrOffset >= OffsetLeft && CurrOffset < OffsetRight) {
+          CurrInputIdx = InputMask[InputEltIdx] * InputEltSize / 8 +
+                         (CurrOffset - OffsetLeft) / 8;
+          Mask.push_back(Builder.getInt8(CurrInputIdx));
+        } else
+          Mask.push_back(Builder.getInt8(255));
+      } else
+        Mask.push_back(Builder.getInt8(255));
+    }
+  }
+  return ConstantVector::get(Mask);
+}
+
 /// A helper function for concatenating vectors. This function concatenates two
 /// vectors having the same element type. If the second vector has fewer
 /// elements than the first, it is padded with undefs.
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -112,6 +112,10 @@
   bool lowerInterleavedStore(StoreInst *SI,
                              SmallVector<Instruction *, 32> &DeadInsts);
 
+  /// Transform a shufflevector whose type does not match its user into
+  /// target-specific intrinsics.
+  bool lowerShuffleVector(ShuffleVectorInst *SI,
+                          SmallVector<Instruction *, 32> &DeadInsts);
+
   /// Returns true if the uses of an interleaved load by the
   /// extractelement instructions in \p Extracts can be replaced by uses of the
   /// shufflevector instructions in \p Shuffles instead. If so, the necessary
@@ -442,6 +446,21 @@
   return true;
 }
 
+bool InterleavedAccess::lowerShuffleVector(
+    ShuffleVectorInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
+
+  LLVM_DEBUG(dbgs() << "IA: Found a shufflevector: " << *SI << "\n");
+
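+  // A typical candidate is a partial shuffle that feeds a widening
+  // conversion, e.g. (illustrative IR only):
+  //   %s = shufflevector <8 x i16> %v, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
+  //   %f = uitofp <2 x i16> %s to <2 x double>
+  // which the target may be able to select as a single TBL1.
+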
+  // Try to create target-specific intrinsics to replace the shuffle.
+  if (!TLI->lowerShuffleVector(SI))
+    return false;
+
+  // We now have a target-specific tbl1 intrinsic; mark the old shufflevector
+  // as dead so it can be erased.
+  DeadInsts.push_back(SI);
+
+  return true;
+}
+
 bool InterleavedAccess::runOnFunction(Function &F) {
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC || !LowerInterleavedAccesses)
@@ -469,5 +488,14 @@
   for (auto I : DeadInsts)
     I->eraseFromParent();
 
+  SmallVector<Instruction *, 32> SFDeadInsts;
+  for (auto &I : instructions(F)) {
+    if (ShuffleVectorInst *SHI = dyn_cast<ShuffleVectorInst>(&I))
+      Changed |= lowerShuffleVector(SHI, SFDeadInsts);
+  }
+
+  for (auto I : SFDeadInsts)
+    I->eraseFromParent();
+
   return Changed;
 }
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -1206,7 +1206,7 @@
   SmallVector<unsigned, 4> Indices;
   for (unsigned i = 0; i < Factor; i++)
     Indices.push_back(i);
-  InterleavedCost = TTI.getInterleavedMemoryOpCost(
+  InterleavedCost = TTI.getInterleavedMemoryOpCost(nullptr, 0,
       Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(),
      InsertionPoint->getPointerAddressSpace());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAG.cpp
@@ -3581,6 +3581,19 @@
   case Intrinsic::aarch64_tagp:
     SelectTagP(Node);
     return;
+  case Intrinsic::aarch64_neon_tbl1_temp: {
+    SDLoc dl(Node);
+
+    SmallVector<SDValue, 2> Ops;
+    // The source vector.
+    Ops.push_back(Node->getOperand(1));
+    // The byte mask.
+    Ops.push_back(Node->getOperand(2));
+    ReplaceNode(Node, CurDAG->getMachineNode(AArch64::TBLv16i8One, dl, VT, Ops));
+    return;
+  }
   case Intrinsic::aarch64_neon_tbl2:
     SelectTable(Node, 2,
                 VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -423,6 +423,8 @@
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
+  bool lowerShuffleVector(ShuffleVectorInst *SI) const override;
+
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9454,6 +9454,119 @@
   return true;
 }
 
+bool AArch64TargetLowering::lowerShuffleVector(ShuffleVectorInst *SI) const {
+  IRBuilder<> Builder(SI);
+
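+  // The intended rewrite, sketched on an illustrative example:
+  //   %s = shufflevector <8 x i16> %v, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
+  //   %f = uitofp <2 x i16> %s to <2 x double>
+  // becomes (modulo the exact intrinsic name mangling)
+  //   %t = call <2 x i64> @llvm.aarch64.neon.tbl1.temp.v2i64.v8i16(
+  //            <8 x i16> %v, <16 x i8> <byte mask>)
+  //   %f = uitofp <2 x i64> %t to <2 x double>
+  // so a single TBL1 performs both the lane selection and the zero-extension.
+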
+  // First check the shufflevector instruction itself:
+  // 1) The first operand has to be 128 bits wide, since the byte mask requires
+  //    a 16 x i8 vector. Smaller shuffles are not handled here (e.g. a v2i16
+  //    source is only 32 bits).
+  // 2) The second operand has to be undef, as TBL1 reads a single register.
+  if (SI->getOperand(0)->getType()->isVectorTy() &&
+      SI->getOperand(0)->getType()->getPrimitiveSizeInBits() != 128)
+    return false;
+
+  // The second operand has to be undef.
+  if (Constant *C = dyn_cast<Constant>(SI->getOperand(1)))
+    if (!isa<UndefValue>(C))
+      return false;
+
+  // We only handle a shufflevector with a single user instruction here;
+  // multiple users would require multiple tbl1 instructions, which is left to
+  // a later stage of the implementation.
+  if (!SI->hasOneUse())
+    return false;
+
+  // Now check the single user. Only UIToFP and a few other instructions are
+  // handled at this stage; the list can be extended later.
+  auto UI = SI->user_begin();
+  Instruction *I = cast<Instruction>(*UI);
+
+  if (I->getOpcode() != Instruction::UIToFP &&
+      I->getOpcode() != Instruction::FAdd &&
+      I->getOpcode() != Instruction::FSub &&
+      I->getOpcode() != Instruction::FMul &&
+      I->getOpcode() != Instruction::Add &&
+      I->getOpcode() != Instruction::Sub) {
+    LLVM_DEBUG(dbgs() << "Shufflevector user instruction does not qualify: "
+                      << *I << "\n");
+    return false;
+  }
+
+  // Now do the type check on the vector. If the type of the input vector to
+  // the user instruction is the same as the user's result type, the shuffle is
+  // already handled by the later DAG lowering and there is nothing to do here.
+  VectorType *SVTy = SI->getType();
+  if (SVTy == I->getType())
+    return false;
+
+  // All unhandled cases have been excluded; now build the intrinsic call.
+  Type *SVEltTy = SVTy->getVectorElementType();
+  unsigned SVNum = SVTy->getVectorNumElements();
+  Type *PromotedIntTy;
+
+  // Decide the tbl1 result type from the user's (e.g. UIToFP) result type.
+  // Only 64-bit elements with two lanes and 32-bit elements with four lanes
+  // are supported, so pick the matching integer type for the tbl1 result.
+  unsigned UIEltSize =
+      I->getType()->getVectorElementType()->getScalarSizeInBits();
+  if (UIEltSize == 64 && SVNum == 2)
+    PromotedIntTy = Type::getInt64Ty(SI->getType()->getContext());
+  else if (UIEltSize == 32 && SVNum == 4)
+    PromotedIntTy = Type::getInt32Ty(SI->getType()->getContext());
+  else
+    return false;
+
+  VectorType *VecTy = VectorType::get(PromotedIntTy, SVNum);
+
+  // The overloaded types of the intrinsic: the tbl1 result type followed by
+  // the tbl1 input source vector type.
+  Type *Tys[2] = {VecTy, SI->getOperand(0)->getType()};
+
+  // Get the input shuffle mask.
+  auto Mask = SI->getShuffleMask();
+
+  // Get the intrinsic declaration.
+  Function *Tbl1Func = Intrinsic::getDeclaration(
+      SI->getModule(), Intrinsic::aarch64_neon_tbl1_temp, Tys);
+
+  // Generate one tbl1 per use; these could be merged when the uses agree on
+  // the result type.
+  for (auto UI = SI->user_begin(), E = SI->user_end(); UI != E; UI++) {
+    Instruction *UserInst = cast<Instruction>(*UI);
+    Type *UserTy = UserInst->getType();
+    // Two operands: the first is the input vector, the second is the mask.
+    SmallVector<Value *, 2> Ops;
+
+    // This is the vector operand of the tbl1 intrinsic; any vector type is
+    // accepted, but it has to be adjusted to match the user's result type.
+    // Changing the type here should be safe, although later passes could be
+    // affected.
+    Ops.push_back(SI->getOperand(0));
+
+    // This is the mask operand of the tbl1 intrinsic, which has to be v16i8.
+    // It is derived from the input shuffle mask together with the user's
+    // result type.
+    unsigned InputEltSize = SVEltTy->getPrimitiveSizeInBits();
+    unsigned OutputEltSize =
+        UserTy->getVectorElementType()->getPrimitiveSizeInBits();
+    Value *Tbl1Mask =
+        createTbl1Mask(Builder, Mask, SVNum, InputEltSize, OutputEltSize);
+    LLVM_DEBUG(dbgs() << "Tbl1 mask: "; Tbl1Mask->dump());
+    Ops.push_back(Tbl1Mask);
+
+    // Make the call for this user and rewrite the user to consume it.
+    CallInst *Tbl1 = Builder.CreateCall(Tbl1Func, Ops);
+    UI->replaceUsesOfWith(SI, Tbl1);
+  }
+
+  return true;
+}
+
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -190,7 +190,8 @@
     return BaseT::isLegalNTStore(DataType, Alignment);
   }
 
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
                                  bool UseMaskForCond = false,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -677,7 +677,8 @@
   return LT.first;
 }
 
-int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+int AArch64TTIImpl::getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                               unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
@@ -700,7 +701,32 @@
       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
   }
 
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+  // Now check whether this interleaved memory access can be lowered to TBL1
+  // later in the InterleavedAccessPass. If so, the cost is the number of TBL1
+  // instructions times the per-TBL1 cost, which is currently set to 1.
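+  // For example, a factor-4 interleaved load of i16 elements vectorized at
+  // VF == 2 whose single user converts to <2 x double>: 64 * 2 == 128 and
+  // 16 * 4 * 2 == 128, so the checks below succeed and the returned cost is
+  // Factor == 4, i.e. one TBL1 per member of the interleave group.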
+  if (I && VF > 1 && I->hasOneUse()) {
+    auto UI = I->user_begin();
+    Instruction *UserInstruction = cast<Instruction>(*UI);
+    // We currently support only the following user instructions; the list can
+    // be extended.
+    if (UserInstruction->getOpcode() == Instruction::UIToFP ||
+        UserInstruction->getOpcode() == Instruction::FAdd ||
+        UserInstruction->getOpcode() == Instruction::FSub ||
+        UserInstruction->getOpcode() == Instruction::FMul ||
+        UserInstruction->getOpcode() == Instruction::Add ||
+        UserInstruction->getOpcode() == Instruction::Sub) {
+      // The first check makes sure the results form a 128-bit vector; the
+      // second makes sure the input data fits into a 128-bit vector, so that
+      // a TBL1 instruction can be used. Group->getFactor() TBL1s will be
+      // generated, and each TBL1 costs 1.
+      if ((UserInstruction->getType()->getScalarSizeInBits() * VF == 128) &&
+          (I->getType()->getScalarSizeInBits() * Factor * VF == 128))
+        return Factor * 1;
+    }
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace,
                                            UseMaskForCond, UseMaskForGaps);
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5914,7 +5914,7 @@
   // Calculate the cost of the whole interleaved group.
   bool UseMaskForGaps =
       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
-  unsigned Cost = TTI.getInterleavedMemoryOpCost(
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(I, VF,
       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);