diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -938,7 +938,8 @@ /// \p AddressSpace is address space of the pointer. /// \p UseMaskForCond indicates if the memory access is predicated. /// \p UseMaskForGaps indicates if gaps should be masked. - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, @@ -1319,7 +1320,8 @@ bool VariableMask, unsigned Alignment, const Instruction *I = nullptr) = 0; virtual int - getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0; @@ -1731,12 +1733,13 @@ return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, Alignment, I); } - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, bool UseMaskForGaps) override { - return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return Impl.getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); } int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- 
a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -445,7 +445,8 @@ return 1; } - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF, + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -900,7 +900,8 @@ return Cost; } - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF, + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2575,6 +2575,12 @@ return false; } + /// Lower a shufflevector to target specific intrinsics. Return + /// true on success; this default implementation always returns false. + /// + /// \p SI is the shufflevector instruction to lower. + virtual bool lowerShuffleVector(ShuffleVectorInst *SI) const { return false; } + /// Return true if zero-extending the specific node Val to type VT2 is free /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or /// because it's folded such as X86 zero-extending loads). diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -570,6 +570,10 @@ def int_aarch64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic; let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
+ class AdvSIMD_Tbl1_temp_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_v16i8_ty], + [IntrNoMem]>; + class AdvSIMD_Tbl1_Intrinsic : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; @@ -607,6 +611,7 @@ llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; } +def int_aarch64_neon_tbl1_temp : AdvSIMD_Tbl1_temp_Intrinsic; def int_aarch64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic; def int_aarch64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic; def int_aarch64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -669,12 +669,12 @@ } int TargetTransformInfo::getInterleavedMemoryOpCost( - unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, - bool UseMaskForGaps) const { + Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond, bool UseMaskForGaps) const { int Cost = TTIImpl->getInterleavedMemoryOpCost( - Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, UseMaskForCond, - UseMaskForGaps); + I, VF, Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, + UseMaskForCond, UseMaskForGaps); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -112,6 +112,10 @@ bool lowerInterleavedStore(StoreInst *SI, SmallVector &DeadInsts); + /// Transform a type-mismatched shufflevector into target specific intrinsics.
+ bool lowerShuffleVector(ShuffleVectorInst *SI, + SmallVector &DeadInsts); + /// Returns true if the uses of an interleaved load by the /// extractelement instructions in \p Extracts can be replaced by uses of the /// shufflevector instructions in \p Shuffles instead. If so, the necessary @@ -443,6 +447,22 @@ return true; } +bool InterleavedAccess::lowerShuffleVector( + ShuffleVectorInst *SI, SmallVector &DeadInsts) { + + LLVM_DEBUG(dbgs() << "IA: Found a shufflevector: " << *SI << "\n"); + + // Try to create target specific intrinsics to replace the shuffle. + if (!TLI->lowerShuffleVector(SI)) + return false; + + // The target emitted its replacement intrinsic. Queue the old + // shufflevector in DeadInsts so that the caller can erase it. + DeadInsts.push_back(SI); + + return true; +} + bool InterleavedAccess::runOnFunction(Function &F) { auto *TPC = getAnalysisIfAvailable(); if (!TPC || !LowerInterleavedAccesses) @@ -470,5 +490,14 @@ for (auto I : DeadInsts) I->eraseFromParent(); + SmallVector SFDeadInsts; + for (auto &I : instructions(F)) { + if (ShuffleVectorInst *SHI = dyn_cast(&I)) + Changed |= lowerShuffleVector(SHI, SFDeadInsts); + } + + for (auto *I : SFDeadInsts) + I->eraseFromParent(); + return Changed; } diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp --- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp +++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp @@ -1207,8 +1207,8 @@ for (unsigned i = 0; i < Factor; i++) Indices.push_back(i); InterleavedCost = TTI.getInterleavedMemoryOpCost( - Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(), - InsertionPoint->getPointerAddressSpace()); + nullptr, 0, Instruction::Load, ILTy, Factor, Indices, + InsertionPoint->getAlignment(), InsertionPoint->getPointerAddressSpace()); if (InterleavedCost >= InstructionCost) { return false; diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp ---
a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -3678,6 +3678,19 @@ case Intrinsic::aarch64_tagp: SelectTagP(Node); return; + case Intrinsic::aarch64_neon_tbl1_temp: { + SDLoc Dl(Node); + + SmallVector Ops; + // the source vector + Ops.push_back(Node->getOperand(1)); + // the mask + Ops.push_back(Node->getOperand(2)); + ReplaceNode(Node, + CurDAG->getMachineNode(AArch64::TBLv16i8One, Dl, VT, Ops)); + + return; + } case Intrinsic::aarch64_neon_tbl2: SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -457,6 +457,7 @@ unsigned Factor) const override; bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; + bool lowerShuffleVector(ShuffleVectorInst *SI) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalICmpImmediate(int64_t) const override; @@ -867,6 +868,13 @@ bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const override; + + /// Create the 16-byte tbl1 mask for a shuffle; entries default to 0xFF. + /// 0xFF is an out-of-range index, which means that the corresponding byte + /// of the output vector is filled with zero.
+ Constant *createTbl1Mask(IRBuilderBase &Builder, ArrayRef InputMask, + unsigned NumElts, unsigned InputEltSize, + unsigned OutputEltSize) const; }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9622,6 +9622,123 @@ return true; } +bool AArch64TargetLowering::lowerShuffleVector(ShuffleVectorInst *SI) const { + IRBuilder<> Builder(SI); + + // First check the shuffle_vector instruction: + // 1) The first operand has to be 128 bits wide, because the byte mask + // requires a 16 x i8 sized vector. Smaller vector shuffles are not + // handled here for TBL1; for example, v2i16 is only 32 bits. + // 2) The 2nd operand has to be UNDEF for the tbl1 instruction. + if (SI->getOperand(0)->getType()->isVectorTy() && + SI->getOperand(0)->getType()->getPrimitiveSizeInBits() != 128) + return false; + + // Operand 2 must be UNDEF. NOTE(review): non-constant values pass - confirm. + if (Constant *C = dyn_cast(SI->getOperand(1))) + if (!(isa(C))) + return false; + + // We only handle a shuffle_vector that has a single user instruction here, + // because multiple user instructions would cause multiple tbl1 instructions + // to be generated. That case is left to a later stage of the implementation. + if (!SI->hasOneUse()) + return false; + + // Now we check the one use instruction, we only handle UItoFP at this stage + // and a few other instructions.
The user instruction list can also be + // expanded later + auto UI = SI->user_begin(); + Instruction *I = cast(*UI); + + // We only support the following instructions at this stage; + // the list can be expanded. + if (I->getOpcode() != Instruction::UIToFP && + I->getOpcode() != Instruction::FAdd && + I->getOpcode() != Instruction::FSub && + I->getOpcode() != Instruction::FMul && + I->getOpcode() != Instruction::Add && + I->getOpcode() != Instruction::Sub) { + LLVM_DEBUG(dbgs() << "Quit Shuffle vector's user instruction not qualify : " + << *I << "\n"); + return false; + } + + // Now we do the type check on the vector. + // If the user's input vector type equals its output type, the shuffle is + // already handled by the later DAG lowering stage, so skip it here. + // NOTE(review): presumably always so for the binary ops above - confirm. + VectorType *SVTy = SI->getType(); + if (SVTy == I->getType()) + return false; + + // At this point all unhandled situations are excluded, so we can work out + // the intrinsic call. + Type *SVEltTy = SVTy->getElementType(); + unsigned SVNum = SVTy->getNumElements(); + Type *PromotedIntTy; + + // Decide the tbl1 result type from the result type of its user (UIToFP). + // As the result type can only be 64-bit or 32-bit float, we can set the + // corresponding integer type as the tbl1 result. NOTE(review): confirm + // getArrayElementType() is valid for what appears to be a vector type. + unsigned UIEltSize = + I->getType()->getArrayElementType()->getScalarSizeInBits(); + if (UIEltSize == 64 && SVNum == 2) + PromotedIntTy = Type::getInt64Ty(SI->getType()->getContext()); + else if (UIEltSize == 32 && SVNum == 4) + PromotedIntTy = Type::getInt32Ty(SI->getType()->getContext()); + else + return false; + + VectorType *VecTy = VectorType::get(PromotedIntTy, SVNum); + + // Overloaded intrinsic types: VecTy is the tbl1 result type worked out + // above, followed by the tbl1 input source vector type. + Type *Tys[2] = {VecTy, SI->getOperand(0)->getType()}; + + // Get the input mask of the shuffle. + auto Mask = SI->getShuffleMask(); + + // Generate the intrinsic function
call + Function *Tbl1Func = Intrinsic::getDeclaration( + SI->getModule(), Intrinsic::aarch64_neon_tbl1_temp, Tys); + + // Generate one Tbl1 for each use (a single one, given hasOneUse() above); + // uses with the same input type could be merged. + for (auto UI = SI->user_begin(), E = SI->user_end(); UI != E; UI++) { + Instruction *I = cast(*UI); + Type *UserTy = I->getType(); + + // Two operands: the 1st is the input vector, the 2nd one is the mask + SmallVector Ops; + + // This is the vector operand to the Tbl1 intrinsic, any vector type is OK + // however we need to adjust it to match the user result type + // we should be safe to arbitrarily change the type here however there could + // be a problem in later passes + Ops.push_back(SI->getOperand(0)); + + // This is the mask operand to the Tbl1 intrinsic, it has to be v16i8 type + // we need to work it out from the input mask together with the result type + // input mask is SI->getShuffleMask() + // result type is that of the user of SI, I->getType() + unsigned InputEltSize = SVEltTy->getPrimitiveSizeInBits(); + unsigned OutputEltSize = + UserTy->getArrayElementType()->getPrimitiveSizeInBits(); + Value *Tbl1mask = + createTbl1Mask(Builder, Mask, SVNum, InputEltSize, OutputEltSize); + LLVM_DEBUG(dbgs() << "Tbl1 mask: "; Tbl1mask->dump()); + Ops.push_back(Tbl1mask); + + // Make the call for this user. NOTE(review): edits SI's uses mid-iteration. + CallInst *Tbl1 = Builder.CreateCall(Tbl1Func, Ops); + UI->replaceUsesOfWith(SI, Tbl1); + } + + // The user of the shuffle now consumes the tbl1 call; report success. + return true; +} EVT AArch64TargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { @@ -14043,3 +14160,51 @@ } return TargetLoweringBase::shouldLocalize(MI, TTI); } + +Constant *AArch64TargetLowering::createTbl1Mask(IRBuilderBase &Builder, + ArrayRef InputMask, + unsigned NumElts, + unsigned InputEltSize, + unsigned OutputEltSize) const { + + unsigned InputEltIdx = 0; + unsigned CurrInputIdx = 0; + unsigned CurrOffset; + unsigned OffsetLeft = 0; + unsigned OffsetRight = InputEltSize; + +
SmallVector Mask; + for (unsigned Idx = 0; Idx < 16; Idx++) { + // Once every element has been placed in the output vector, fill the + // remaining mask bytes with the out-of-range index. + if (InputEltIdx >= NumElts) + Mask.push_back(Builder.getInt8(255)); + else { + CurrOffset = Idx * 8; + if (CurrOffset >= OffsetLeft && CurrOffset < OffsetRight) { + CurrInputIdx = InputMask[InputEltIdx] * InputEltSize / 8 + + (CurrOffset - OffsetLeft) / 8; + Mask.push_back(Builder.getInt8(CurrInputIdx)); + } + // Finished one input element, move to the next one. + else if (CurrOffset == OffsetRight) { + InputEltIdx++; + if (InputEltIdx >= NumElts) { + Mask.push_back(Builder.getInt8(255)); + continue; + } + OffsetLeft = OutputEltSize * InputEltIdx; + OffsetRight = OffsetLeft + InputEltSize; + // Check whether this byte falls inside the new element's bit range. + if (CurrOffset >= OffsetLeft && CurrOffset < OffsetRight) { + CurrInputIdx = InputMask[InputEltIdx] * InputEltSize / 8 + + (CurrOffset - OffsetLeft) / 8; + Mask.push_back(Builder.getInt8(CurrInputIdx)); + } else + Mask.push_back(Builder.getInt8(255)); + } else + Mask.push_back(Builder.getInt8(255)); + } + } + return ConstantVector::get(Mask); +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -189,7 +189,8 @@ return BaseT::isLegalNTStore(DataType, Alignment); } - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -683,13 +683,11 @@
return LT.first; } -int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int AArch64TTIImpl::getInterleavedMemoryOpCost( + Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond, bool UseMaskForGaps) { + assert(Factor >= 2 && "Invalid interleave factor"); auto *VecVTy = cast(VecTy); @@ -706,8 +704,32 @@ return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL); } - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + // We now check whether this interleaved memory access can be lowered + // to TBL1 instructions later in the InterleavedAccessPass. + // If so, the cost will be the number of TBL1 * the basic cost of a + // TBL1 instruction, which is set to 1 at this time. + if (I && VF > 1 && I->hasOneUse()) { + auto UI = I->user_begin(); + Instruction *UserInstruction = cast(*UI); + // We currently support just the following instructions; can be expanded. + if (UserInstruction->getOpcode() == Instruction::UIToFP || + UserInstruction->getOpcode() == Instruction::FAdd || + UserInstruction->getOpcode() == Instruction::FSub || + UserInstruction->getOpcode() == Instruction::FMul || + UserInstruction->getOpcode() == Instruction::Add || + UserInstruction->getOpcode() == Instruction::Sub) { + // The first check makes sure the result can form a 128-bit vector. + // The 2nd check makes sure the input data can fit into a 128-bit vector + // so that we can use the tbl1 instruction. + // Factor tbl1 instructions will be generated, each of which costs 1. + if ((UserInstruction->getType()->getScalarSizeInBits() * VF == 128) && + (I->getType()->getScalarSizeInBits() * Factor * VF == 128)) + return Factor * 1; + } + } + + return BaseT::getInterleavedMemoryOpCost(I, VF,
Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -216,7 +216,8 @@ int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, const Instruction *I = nullptr); - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -823,9 +823,9 @@ } int ARMTTIImpl::getInterleavedMemoryOpCost( - unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, - bool UseMaskForGaps) { + Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond, bool UseMaskForGaps) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); @@ -855,8 +855,8 @@ return 2 * BaseCost; } - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -124,10 +124,13 
@@ unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment, const Instruction *I); - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool UseMaskForCond = false, - bool UseMaskForGaps = false); + unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF, + unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef Indices, + unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond = false, + bool UseMaskForGaps = false); unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I); unsigned getArithmeticInstrCost( diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -220,13 +220,13 @@ Alignment, I); } -unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, - Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, - bool UseMaskForGaps) { +unsigned HexagonTTIImpl::getInterleavedMemoryOpCost( + Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond, bool UseMaskForGaps) { if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); return getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, nullptr); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- 
a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -104,10 +104,9 @@ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, const Instruction *I = nullptr); - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -907,16 +907,13 @@ return Cost; } -int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int PPCTTIImpl::getInterleavedMemoryOpCost( + Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond, bool UseMaskForGaps) { if (UseMaskForCond || UseMaskForGaps) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); assert(isa(VecTy) && diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -96,10 +96,9 @@ int getMemoryOpCost(unsigned Opcode, Type *Src, 
MaybeAlign Alignment, unsigned AddressSpace, const Instruction *I = nullptr); - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VFactor, + unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -1072,16 +1072,13 @@ // needed for using / defining the vector operands. The SystemZ version does // roughly the same but bases the computations on vector permutations // instead. -int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int SystemZTTIImpl::getInterleavedMemoryOpCost( + Instruction *I, unsigned VFactor, unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef Indices, unsigned Alignment, + unsigned AddressSpace, bool UseMaskForCond, bool UseMaskForGaps) { if (UseMaskForCond || UseMaskForGaps) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(I, VFactor, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -161,9 +161,10 @@ int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool 
IsUnsigned); - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, + unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false); int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3904,15 +3904,15 @@ bool UseMaskForGaps) { if (UseMaskForCond || UseMaskForGaps) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); // We currently Support only fully-interleaved groups, with no gaps. // TODO: Support also strided loads (interleaved-groups with gaps). if (Indices.size() && Indices.size() != Factor) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace); // VecTy for interleave memop is . // So, for VF=4, Interleave Factor = 3, Element type = i32 we have @@ -3923,8 +3923,8 @@ // the VF=2, while v2i128 is an unsupported MVT vector type // (see MachineValueType.h::getVectorVT()). 
if (!LegalVT.isVector()) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace); unsigned VF = cast(VecTy)->getNumElements() / Factor; Type *ScalarTy = cast(VecTy)->getElementType(); @@ -3945,8 +3945,8 @@ VectorType *VT = VectorType::get(ScalarTy, VF); EVT ETy = TLI->getValueType(DL, VT); if (!ETy.isSimple()) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace); // TODO: Complete for other data-types and strides. // Each combination of Stride, ElementTy and VF results in a different @@ -4004,8 +4004,8 @@ return NumOfMemOps * MemOpCost + Entry->Cost; } - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace); } // Get estimation for interleaved load/store operations and strided load. @@ -4021,8 +4021,8 @@ bool UseMaskForGaps) { if (UseMaskForCond || UseMaskForGaps) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); // VecTy for interleave memop is . 
@@ -4139,13 +4139,11 @@ return Cost; } -int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int X86TTIImpl::getInterleavedMemoryOpCost( + Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond, bool UseMaskForGaps) { + auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { Type *EltTy = cast(VecTy)->getElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || @@ -4164,7 +4162,7 @@ Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5938,7 +5938,7 @@ bool UseMaskForGaps = Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( - I->getOpcode(), WideVecTy, Group->getFactor(), Indices, + I, VF, I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps); if (Group->isReverse()) {