Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -9833,6 +9833,124 @@
       %res = select <16 x i1> %mask, <16 x float> %value, <16 x float> %oldval
       store <16 x float> %res, <16 x float>* %ptr, align 4
 
+Indexed Vector Load and Store Intrinsics
+----------------------------------------
+
+LLVM provides intrinsics for indexed vector load and store operations, which allow read/write access to multiple memory addresses.
+The addresses are specified by a base address and an index vector.
+
+.. _int_iload:
+
+'``llvm.indexed.load.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The loaded data is a vector of any integer or floating point data type.
+
+::
+
+      declare <16 x i32>   @llvm.indexed.load.v16i32 (i32* <ptr>, <16 x i32> <index>, i32 <alignment>)
+      declare <8 x double> @llvm.indexed.load.v8f64 (double* <ptr>, <8 x i32> <index>, i32 <alignment>)
+
+Overview:
+"""""""""
+
+Reads vector elements from multiple memory addresses. The address of each element is computed from the base address and the corresponding index element.
+
+
+Arguments:
+""""""""""
+
+The first operand is the base pointer for the load. It must be a pointer to the element type of the loaded vector.
+The second operand is the index vector, whose element type is always 'i32'. It must have the same number of elements as the loaded vector.
+The third operand is the alignment of the source locations. It is always of 'i32' type.
+
+The index must be a constant vector, and the alignment must be a constant.
+
+Semantics:
+""""""""""
+
+The '``llvm.indexed.load``' intrinsic is designed for reading vector elements from multiple addresses in a single IR operation.
+It can be used to express interleaved loads and strided loads.
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+       %res = call <8 x float> @llvm.indexed.load.v8f32 (float* %ptr, <8 x i32> <i32 idx0, i32 idx1, i32 idx2, i32 idx3, i32 idx4, i32 idx5, i32 idx6, i32 idx7>, i32 4)
+
+       ;; Apart from potential memory access exceptions, the result is identical to the
+       ;; one produced by the following instruction sequence:
+       %ptr0 = getelementptr float, float* %ptr, i32 idx0            ; Address for lane 0
+       %ptr1 = getelementptr float, float* %ptr, i32 idx1            ; Address for lane 1
+       ...
+       %ptr7 = getelementptr float, float* %ptr, i32 idx7            ; Address for lane 7
+       %lane0 = load float, float* %ptr0, align 4                    ; Load for lane 0
+       %lane1 = load float, float* %ptr1, align 4                    ; Load for lane 1
+       ...
+       %lane7 = load float, float* %ptr7, align 4                    ; Load for lane 7
+       %res0 = insertelement <8 x float> undef, float %lane0, i32 0  ; Insert lane 0
+       %res1 = insertelement <8 x float> %res0, float %lane1, i32 1  ; Insert lane 1
+       ...
+       %res = insertelement <8 x float> %res6, float %lane7, i32 7   ; Insert lane 7
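+
+The index vector directly encodes the access pattern. As a sketch (the pointers
+``%ptr`` and ``%base`` and the concrete index values below are illustrative only,
+following the interleaving pattern 0, N, 2N, ..., 1, N+1, ... targeted by this patch),
+a stride-2 load and a 2-way de-interleaving load can be written as:
+
+.. code-block:: llvm
+
+       ;; Strided load: gather 4 floats from %ptr with a stride of 2 elements.
+       %strided = call <4 x float> @llvm.indexed.load.v4f32 (float* %ptr, <4 x i32> <i32 0, i32 2, i32 4, i32 6>, i32 4)
+
+       ;; 2-way de-interleaving load: even-indexed elements end up in the low half of
+       ;; the result and odd-indexed elements in the high half
+       ;; (index pattern 0, 2, 4, 6, 1, 3, 5, 7).
+       %deinterleaved = call <8 x i32> @llvm.indexed.load.v8i32 (i32* %base, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>, i32 4)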
+
+.. _int_istore:
+
+'``llvm.indexed.store.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The data stored in memory is a vector of any integer or floating point data type.
+
+::
+
+       declare void @llvm.indexed.store.v16i32 (<16 x i32> <value>, i32* <ptr>, <16 x i32> <index>, i32 <alignment>)
+       declare void @llvm.indexed.store.v8f64 (<8 x double> <value>, double* <ptr>, <8 x i32> <index>, i32 <alignment>)
+
+Overview:
+"""""""""
+
+Writes vector elements to multiple memory addresses. The address of each element is computed from the base address and the corresponding index element.
+
+Arguments:
+""""""""""
+
+The first operand is the vector value to be written to memory.
+The second operand is the base pointer for the store. It must be a pointer to the element type of the stored vector.
+The third operand is the index vector, whose element type is always 'i32'. It must have the same number of elements as the stored vector.
+The fourth operand is the alignment of the destination locations. It is always of 'i32' type.
+
+The index must be a constant vector, and the alignment must be a constant.
+
+Semantics:
+""""""""""
+
+The '``llvm.indexed.store``' intrinsic is designed for writing vector elements to multiple addresses in a single IR operation.
+It can be used to express interleaved stores and strided stores.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+       call void @llvm.indexed.store.v8f32(<8 x float> %value, float* %ptr, <8 x i32> <i32 idx0, i32 idx1, i32 idx2, i32 idx3, i32 idx4, i32 idx5, i32 idx6, i32 idx7>, i32 4)
+
+       ;; Apart from potential memory access exceptions, the effect is identical to the
+       ;; one produced by the following instruction sequence:
+       %ptr0 = getelementptr float, float* %ptr, i32 idx0            ; Address for lane 0
+       %ptr1 = getelementptr float, float* %ptr, i32 idx1            ; Address for lane 1
+       ...
+       %ptr7 = getelementptr float, float* %ptr, i32 idx7            ; Address for lane 7
+       %lane0 = extractelement <8 x float> %value, i32 0             ; Extract lane 0
+       %lane1 = extractelement <8 x float> %value, i32 1             ; Extract lane 1
+       ...
+       %lane7 = extractelement <8 x float> %value, i32 7             ; Extract lane 7
+       store float %lane0, float* %ptr0, align 4                     ; Store lane 0
+       store float %lane1, float* %ptr1, align 4                     ; Store lane 1
+       ...
+       store float %lane7, float* %ptr7, align 4                     ; Store lane 7
 
 Memory Use Markers
 ------------------
 
Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -312,6 +312,11 @@
   bool isLegalMaskedStore(Type *DataType, int Consecutive) const;
   bool isLegalMaskedLoad(Type *DataType, int Consecutive) const;
+  /// \brief Return true if the target supports an indexed store/load of the
+  /// given data type with the given constant indices.
+  bool supportIndexedStore(Type *DataType, ArrayRef<unsigned> Indices) const;
+  bool supportIndexedLoad(Type *DataType, ArrayRef<unsigned> Indices) const;
+
   /// \brief Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
@@ -542,6 +547,10 @@ int64_t Scale) = 0; virtual bool isLegalMaskedStore(Type *DataType, int Consecutive) = 0; virtual bool isLegalMaskedLoad(Type *DataType, int Consecutive) = 0; + virtual bool supportIndexedStore(Type *DataType, + ArrayRef Indices) = 0; + virtual bool supportIndexedLoad(Type *DataType, + ArrayRef Indices) = 0; virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) = 0; @@ -658,6 +667,13 @@ bool isLegalMaskedLoad(Type *DataType, int Consecutive) override { return Impl.isLegalMaskedLoad(DataType, Consecutive); } + bool supportIndexedStore(Type *DataType, + ArrayRef Indices) override { + return Impl.supportIndexedStore(DataType, Indices); + } + bool supportIndexedLoad(Type *DataType, ArrayRef Indices) override { + return Impl.supportIndexedLoad(DataType, Indices); + } int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) override { return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -217,6 +217,14 @@ bool isLegalMaskedLoad(Type *DataType, int Consecutive) { return false; } + bool supportIndexedStore(Type *DataType, ArrayRef Indices) { + return false; + } + + bool supportIndexedLoad(Type *DataType, ArrayRef Indices) { + return false; + } + int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // Guess that all legal addressing mode are free. Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -690,6 +690,9 @@ // Masked load and store MLOAD, MSTORE, + // Indexed load and store + ILOAD, ISTORE, + /// This corresponds to the llvm.lifetime.* intrinsics. The first operand /// is the chain and the second operand is the alloca pointer. LIFETIME_START, LIFETIME_END, Index: include/llvm/CodeGen/SelectionDAG.h =================================================================== --- include/llvm/CodeGen/SelectionDAG.h +++ include/llvm/CodeGen/SelectionDAG.h @@ -856,6 +856,15 @@ SDValue getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, bool IsTrunc); + + // Construct a ILOAD node + SDValue getIndexedLoad(EVT VT, SDValue Chain, SDValue Ptr, SDValue Index, + EVT MemVT, MachineMemOperand *MMO, SDLoc dl); + // Construct a ISTORE node + SDValue getIndexedStore(SDValue Chain, SDValue Val, SDValue Ptr, + SDValue Index, EVT MemVT, MachineMemOperand *MMO, + SDLoc dl); + /// Construct a node to track a Value* through the backend. 
SDValue getSrcValue(const Value *v); Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -1151,6 +1151,8 @@ N->getOpcode() == ISD::ATOMIC_STORE || N->getOpcode() == ISD::MLOAD || N->getOpcode() == ISD::MSTORE || + N->getOpcode() == ISD::ILOAD || + N->getOpcode() == ISD::ISTORE || N->isMemIntrinsic() || N->isTargetMemoryOpcode(); } @@ -1987,6 +1989,59 @@ } }; +/// This base class is used to represent ILOAD and ISTORE nodes +class IndexedLoadStoreSDNode : public MemSDNode { + // Operands + SDUse Ops[4]; + +public: + friend class SelectionDAG; + IndexedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl, + SDValue *Operands, unsigned numOperands, SDVTList VTs, + EVT MemVT, MachineMemOperand *MMO) + : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { + InitOperands(Ops, Operands, numOperands); + } + + // IndexedLoadSDNode (Chain, Ptr, Index) + // IndexedStoreSDNode (Chain, Ptr, Index, Src) + // In the both nodes address is Op1, Index is Op2. + const SDValue &getBasePtr() const { return getOperand(1); } + const SDValue &getIndex() const { return getOperand(2); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == ISD::ILOAD || N->getOpcode() == ISD::ISTORE; + } +}; + +/// This class is used to represent an ILOAD node +class IndexedLoadSDNode : public IndexedLoadStoreSDNode { +public: + friend class SelectionDAG; + IndexedLoadSDNode(unsigned Order, DebugLoc dl, SDValue *Operands, + unsigned numOperands, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : IndexedLoadStoreSDNode(ISD::ILOAD, Order, dl, Operands, numOperands, + VTs, MemVT, MMO) {} + + static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ILOAD; } +}; + +/// This class is used to represent an ISTORE node +class IndexedStoreSDNode : public IndexedLoadStoreSDNode { +public: + friend class SelectionDAG; + IndexedStoreSDNode(unsigned Order, DebugLoc dl, SDValue *Operands, + unsigned numOperands, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : IndexedLoadStoreSDNode(ISD::ISTORE, Order, dl, Operands, numOperands, + VTs, MemVT, MMO) {} + + const SDValue &getValue() const { return getOperand(3); } + + static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ISTORE; } +}; + /// An SDNode that represents everything that will be needed /// to construct a MachineInstr. These nodes are created during the /// instruction selection proper phase. 
Index: include/llvm/IR/Intrinsics.h =================================================================== --- include/llvm/IR/Intrinsics.h +++ include/llvm/IR/Intrinsics.h @@ -77,7 +77,7 @@ Void, VarArg, MMX, Metadata, Half, Float, Double, Integer, Vector, Pointer, Struct, Argument, ExtendArgument, TruncArgument, HalfVecArgument, - SameVecWidthArgument, PtrToArgument, VecOfPtrsToElt + SameVecWidthArgument, PtrToArgument, VecOfPtrsToElt, PtrToVecElt } Kind; union { @@ -100,14 +100,14 @@ assert(Kind == Argument || Kind == ExtendArgument || Kind == TruncArgument || Kind == HalfVecArgument || Kind == SameVecWidthArgument || Kind == PtrToArgument || - Kind == VecOfPtrsToElt); + Kind == VecOfPtrsToElt || Kind == PtrToVecElt); return Argument_Info >> 3; } ArgKind getArgumentKind() const { assert(Kind == Argument || Kind == ExtendArgument || Kind == TruncArgument || Kind == HalfVecArgument || Kind == SameVecWidthArgument || Kind == PtrToArgument || - Kind == VecOfPtrsToElt); + Kind == VecOfPtrsToElt || Kind == PtrToVecElt); return (ArgKind)(Argument_Info & 7); } Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -118,6 +118,7 @@ } class LLVMPointerTo : LLVMMatchType; class LLVMVectorOfPointersToElt : LLVMMatchType; +class LLVMPointerToVectorElt : LLVMMatchType; // Match the type of another intrinsic parameter that is expected to be a // vector type, but change the element count to be half as many @@ -608,6 +609,20 @@ LLVMVectorSameWidth<0, llvm_i1_ty>], [IntrReadWriteArgMem]>; +//===--------------------- Indexed load/store Intrinsics ------------------===// +// +def int_indexed_load : Intrinsic<[llvm_anyvector_ty], + [LLVMPointerToVectorElt<0>, + LLVMVectorSameWidth<0, llvm_i32_ty>, + llvm_i32_ty], + [IntrReadArgMem]>; + +def int_indexed_store : Intrinsic<[], + [llvm_anyvector_ty, LLVMPointerToVectorElt<0>, + LLVMVectorSameWidth<0, llvm_i32_ty>, + llvm_i32_ty], + [IntrReadWriteArgMem]>; + // Intrinsics to support bit sets. 
def int_bitset_test : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty], [IntrNoMem]>; Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -115,6 +115,16 @@ return TTIImpl->isLegalMaskedLoad(DataType, Consecutive); } +bool TargetTransformInfo::supportIndexedStore( + Type *DataType, ArrayRef Indices) const { + return TTIImpl->supportIndexedStore(DataType, Indices); +} + +bool TargetTransformInfo::supportIndexedLoad(Type *DataType, + ArrayRef Indices) const { + return TTIImpl->supportIndexedLoad(DataType, Indices); +} + int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -1251,6 +1251,91 @@ CI->eraseFromParent(); } +// Translate index load intrinsic, like +// <4 x i32> @llvm.indexed.load( %, +// i32 align) +// to scalar loads and insertelements: +// %ptr0 = getelementptr i32, i32 *%ptr, i32 %idx0 +// %lane0 = load i32, i32 *%ptr0 ; Load lane 0 +// %res0 = insertelement <4 x i32> undef, i32 %lane0, i32 0 ; Insert lane 0 +// %ptr1 = getelementptr i32, i32 *%ptr, i32 %idx1 +// %lane1 = load i32, i32 *%ptr1 ; Load lane 1 +// %res1 = insertelement <4 x i32> %res0, i32 %lane1, i32 1 ; Insert lane 1 +// %ptr2 = getelementptr i32, i32 *%ptr, i32 %idx2 +// %lane2 = load i32, i32 *%ptr2 ; Load lane 2 +// %res2 = insertelement <4 x i32> %res0, i32 %lane2, i32 1 ; Insert lane 2 +// %ptr3 = getelementptr i32, i32 *%ptr, i32 %idx3 +// %lane3 = load i32, i32 *%ptr3 ; Load lane 3 +// %res = insertelement <4 x i32> %res2, i32 %lane3, i32 3 ; Insert lane 3 +static void ScalarizeIndexedLoad(CallInst *CI) { + const Constant *CIdx = dyn_cast(CI->getArgOperand(1)); + assert(CIdx && (isa(CIdx) || isa(CIdx)) && + "Expect a constant index vector"); + Value *Ptr = CI->getArgOperand(0); + ConstantInt *Alignment = dyn_cast(CI->getArgOperand(2)); + assert(Alignment && "The alignment must be a constant"); + unsigned Align = Alignment->getZExtValue(); + + VectorType *VecTy = dyn_cast(CI->getType()); + Type *EltTy = VecTy->getElementType(); + Type *PtrTy = EltTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + IRBuilder<> Builder(CI); + Ptr = Builder.CreateBitCast(Ptr, PtrTy); + Value *Result = UndefValue::get(VecTy); + for (unsigned i = 0; i < VecTy->getNumElements(); i++) { + ConstantInt *IdxElt = dyn_cast(CIdx->getAggregateElement(i)); + assert(IdxElt && "Expect a constant index element"); + Value *EltPtr = Builder.CreateGEP(Ptr, IdxElt); + Value *Elt = Builder.CreateAlignedLoad(EltPtr, Align); + Result = Builder.CreateInsertElement(Result, Elt, Builder.getInt32(i)); + } + + CI->replaceAllUsesWith(Result); + CI->eraseFromParent(); +} + +// Translate index load intrinsic, like +// void @llvm.indexed.store(i32* %ptr, <4 x i32> %vec, +// <4 x i32> %, i32 align) +// to extractelements and scalar stores: +// %ptr0 = getelementptr i32, i32* %ptr, i32 %idx0 +// %lane0 = extractelement <4 x i32> %vec, i32 0 ; Extract lane 0 +// store i32 %lane0, i32* %ptr0 ; Store lane 0 +// %ptr1 = getelementptr i32, i32* %ptr, i32 %idx1 +// %lane1 = extractelement <4 x i32> %vec, i32 1 ; Extract lane 1 +// store i32 %lane1, i32* %ptr1 ; Store lane 1 +// %ptr2 = getelementptr i32, i32* %ptr, i32 %idx2 +// %lane2 = extractelement <4 x i32> %vec, i32 
1 ; Extract lane 2 +// store i32 %lane2, i32* %ptr2 ; Store lane 2 +// %ptr3 = getelementptr i32, i32* %ptr, i32 %idx3 +// %lane3 = extractelement <4 x i32> %vec, i32 3 ; Extract lane 3 +// store i32 %lane3, i32* %ptr3 ; Store lane 3 +static void ScalarizeIndexedStore(CallInst *CI) { + const Constant *CIdx = dyn_cast(CI->getArgOperand(2)); + assert(CIdx && (isa(CIdx) || isa(CIdx)) && + "Expect a constant index vector"); + Value *VecVal = CI->getArgOperand(0); + Value *Ptr = CI->getArgOperand(1); + ConstantInt *Alignment = dyn_cast(CI->getArgOperand(3)); + assert(Alignment && "The alignment must be a constant"); + unsigned Align = Alignment->getZExtValue(); + + VectorType *VecTy = dyn_cast(VecVal->getType()); + Type *EltTy = VecTy->getElementType(); + Type *PtrTy = EltTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + IRBuilder<> Builder(CI); + Ptr = Builder.CreateBitCast(Ptr, PtrTy); + for (unsigned i = 0; i < VecTy->getNumElements(); i++) { + ConstantInt *IdxElt = dyn_cast(CIdx->getAggregateElement(i)); + assert(IdxElt && "Expect a constant index element"); + Value *Elt = Builder.CreateExtractElement(VecVal, Builder.getInt32(i)); + Value *EllPtr = Builder.CreateGEP(Ptr, IdxElt); + Builder.CreateAlignedStore(Elt, EllPtr, Align); + } + + CI->eraseFromParent(); +} + bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); @@ -1362,6 +1447,52 @@ } return false; } + case Intrinsic::indexed_load: { + SmallVector Indices; + Constant *CIdx = dyn_cast(CI->getArgOperand(1)); + assert(CIdx && + (isa(CIdx) || isa(CIdx)) && + "Expect a constant index vector"); + unsigned NumElts = CIdx->getType()->getVectorNumElements(); + for (unsigned i = 0; i < NumElts; i++) { + ConstantInt *IdxElt = + dyn_cast(CIdx->getAggregateElement(i)); + assert(IdxElt && "Expect a constant index element"); + Indices.push_back(IdxElt->getZExtValue()); + } + + if (!TTI->supportIndexedLoad(CI->getType(), Indices)) { + // TODO: Some llvm.indexed.load can be optimized by vector load and + // shufflevector. + ScalarizeIndexedLoad(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::indexed_store: { + SmallVector Indices; + Constant *CIdx = dyn_cast(CI->getArgOperand(2)); + assert(CIdx && + (isa(CIdx) || isa(CIdx)) && + "Expect a constant index vector"); + unsigned NumElts = CIdx->getType()->getVectorNumElements(); + for (unsigned i = 0; i < NumElts; i++) { + ConstantInt *IdxElt = + dyn_cast(CIdx->getAggregateElement(i)); + assert(IdxElt && "Expect a constant index element"); + Indices.push_back(IdxElt->getZExtValue()); + } + + if (!TTI->supportIndexedStore(CI->getArgOperand(0)->getType(), Indices)) { + // TODO: Some llvm.indexed.store can be optimized by shufflevector and + // vector store. 
+ ScalarizeIndexedStore(CI); + ModifiedDT = true; + return true; + } + return false; + } } if (TLI) { Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5082,6 +5082,56 @@ return SDValue(N, 0); } +SDValue SelectionDAG::getIndexedLoad(EVT VT, SDValue Chain, SDValue Ptr, + SDValue Index, EVT MemVT, + MachineMemOperand *MMO, SDLoc dl) { + assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); + SDVTList VTs = getVTList(VT, MVT::Other); + SDValue Ops[] = {Chain, Ptr, Index}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::ILOAD, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED, + MMO->isVolatile(), MMO->isNonTemporal(), + MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + SDNode *N = new (NodeAllocator) IndexedLoadSDNode( + dl.getIROrder(), dl.getDebugLoc(), Ops, 3, VTs, MemVT, MMO); + CSEMap.InsertNode(N, IP); + InsertNode(N); + return SDValue(N, 0); +} + +SDValue SelectionDAG::getIndexedStore(SDValue Chain, SDValue Val, SDValue Ptr, + SDValue Index, EVT MemVT, + MachineMemOperand *MMO, SDLoc dl) { + assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); + EVT VT = Val.getValueType(); + SDVTList VTs = getVTList(MVT::Other); + SDValue Ops[] = {Chain, Ptr, Index, Val}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::ISTORE, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), + MMO->isNonTemporal(), MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + SDNode *N = new (NodeAllocator) IndexedStoreSDNode( + dl.getIROrder(), dl.getDebugLoc(), Ops, 4, VTs, MemVT, MMO); + CSEMap.InsertNode(N, IP); + InsertNode(N); + return SDValue(N, 0); +} + SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue SV, Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -774,6 +774,8 @@ void visitStore(const StoreInst &I); void visitMaskedLoad(const CallInst &I); void visitMaskedStore(const CallInst &I); + void visitIndexedLoad(const CallInst &I); + void visitIndexedStore(const CallInst &I); void visitAtomicCmpXchg(const AtomicCmpXchgInst &I); void visitAtomicRMW(const AtomicRMWInst &I); void visitFence(const FenceInst &I); Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3705,6 +3705,65 @@ setValue(&I, Load); } +void SelectionDAGBuilder::visitIndexedStore(const CallInst &I) { + SDLoc sdl = getCurSDLoc(); + + // llvm.indexed.store.*(Src, Ptr, Index, alignemt) + Value *PtrOperand = I.getArgOperand(1); + SDValue Ptr = getValue(PtrOperand); + SDValue Src = getValue(I.getArgOperand(0)); + SDValue Index = getValue(I.getArgOperand(2)); + EVT VT = Src.getValueType(); + unsigned Alignment = 
(cast(I.getArgOperand(3)))->getZExtValue(); + if (!Alignment) + Alignment = DAG.getEVTAlignment(VT); + + AAMDNodes AAInfo; + I.getAAMetadata(AAInfo); + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, + VT.getStoreSize(), Alignment, AAInfo); + SDValue StoreNode = + DAG.getIndexedStore(getRoot(), Src, Ptr, Index, VT, MMO, sdl); + DAG.setRoot(StoreNode); + setValue(&I, StoreNode); +} + +void SelectionDAGBuilder::visitIndexedLoad(const CallInst &I) { + SDLoc sdl = getCurSDLoc(); + // @llvm.indexed.load.*(Ptr, Index, alignment) + Value *PtrOperand = I.getArgOperand(0); + SDValue Ptr = getValue(PtrOperand); + SDValue Index = getValue(I.getArgOperand(1)); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = TLI.getValueType(I.getType()); + unsigned Alignment = (cast(I.getArgOperand(2)))->getZExtValue(); + if (!Alignment) + Alignment = DAG.getEVTAlignment(VT); + + AAMDNodes AAInfo; + I.getAAMetadata(AAInfo); + const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); + + SDValue InChain = DAG.getRoot(); + if (AA->pointsToConstantMemory(AliasAnalysis::Location( + PtrOperand, AA->getTypeStoreSize(I.getType()), AAInfo))) { + // Do not serialize (non-volatile) loads of constant memory with anything. + InChain = DAG.getEntryNode(); + } + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, + VT.getStoreSize(), Alignment, AAInfo, Ranges); + + SDValue Load = DAG.getIndexedLoad(VT, InChain, Ptr, Index, VT, MMO, sdl); + SDValue OutChain = Load.getValue(1); + DAG.setRoot(OutChain); + setValue(&I, Load); +} + void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering SuccessOrder = I.getSuccessOrdering(); @@ -4864,6 +4923,12 @@ case Intrinsic::masked_store: visitMaskedStore(I); return nullptr; + case Intrinsic::indexed_load: + visitIndexedLoad(I); + return nullptr; + case Intrinsic::indexed_store: + visitIndexedStore(I); + return nullptr; case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -273,6 +273,8 @@ case ISD::STORE: return "store"; case ISD::MLOAD: return "masked_load"; case ISD::MSTORE: return "masked_store"; + case ISD::ILOAD: return "indexed_load"; + case ISD::ISTORE: return "indexed_store"; case ISD::VAARG: return "vaarg"; case ISD::VACOPY: return "vacopy"; case ISD::VAEND: return "vaend"; Index: lib/IR/Function.cpp =================================================================== --- lib/IR/Function.cpp +++ lib/IR/Function.cpp @@ -548,10 +548,10 @@ IIT_HALF_VEC_ARG = 29, IIT_SAME_VEC_WIDTH_ARG = 30, IIT_PTR_TO_ARG = 31, - IIT_VEC_OF_PTRS_TO_ELT = 32 + IIT_VEC_OF_PTRS_TO_ELT = 32, + IIT_PTR_TO_VEC_ELT = 33 }; - static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, SmallVectorImpl &OutputTable) { IIT_Info Info = IIT_Info(Infos[NextElt++]); @@ -674,6 +674,12 @@ ArgInfo)); return; } + case IIT_PTR_TO_VEC_ELT: { + unsigned ArgInfo = (NextElt == Infos.size() ? 
0 : Infos[NextElt++]); + OutputTable.push_back( + IITDescriptor::get(IITDescriptor::PtrToVecElt, ArgInfo)); + return; + } case IIT_EMPTYSTRUCT: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0)); return; @@ -802,6 +808,14 @@ return VectorType::get(PointerType::getUnqual(EltTy), VTy->getNumElements()); } + case IITDescriptor::PtrToVecElt: { + Type *Ty = Tys[D.getArgumentNumber()]; + VectorType *VTy = dyn_cast(Ty); + if (!VTy) + llvm_unreachable("Expected an argument of Vector Type"); + Type *EltTy = VTy->getVectorElementType(); + return PointerType::getUnqual(EltTy); + } } llvm_unreachable("unhandled"); } Index: lib/IR/Verifier.cpp =================================================================== --- lib/IR/Verifier.cpp +++ lib/IR/Verifier.cpp @@ -3042,6 +3042,18 @@ return (!(ThisArgEltTy->getElementType() == ReferenceType->getVectorElementType())); } + case IITDescriptor::PtrToVecElt: { + if (D.getArgumentNumber() >= ArgTys.size()) + return true; + VectorType *ReferenceType = + dyn_cast(ArgTys[D.getArgumentNumber()]); + if (!ReferenceType) + return true; + PointerType *ThisArgType = dyn_cast(Ty); + return (!ThisArgType || + ThisArgType->getPointerElementType() != + ReferenceType->getElementType()); + } } llvm_unreachable("unhandled"); } Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -495,6 +495,8 @@ setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::ILOAD); + setTargetDAGCombine(ISD::ISTORE); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -8145,6 +8147,113 @@ return SDValue(); } +static unsigned getLdNStNIntrinsicID(unsigned NumVec, bool IsLoad) { + static unsigned LoadInt[3] = {Intrinsic::aarch64_neon_ld2, + Intrinsic::aarch64_neon_ld3, + Intrinsic::aarch64_neon_ld4}; + static unsigned StoreInt[3] = {Intrinsic::aarch64_neon_st2, + Intrinsic::aarch64_neon_st3, + Intrinsic::aarch64_neon_st4}; + + return IsLoad ? LoadInt[NumVec - 2] : StoreInt[NumVec - 2]; +} + +// Check if the given indices are interleaved by N (N = 2,3,4). +bool static isInterleavedIndices(ArrayRef Indices, unsigned &NumVec, + unsigned &NumElts) { + if (Indices.size() <= 2) + return false; + if (Indices[0] != 0) + return false; + NumVec = Indices[1]; + if (NumVec < 2 || NumVec > 4) + return false; + + NumElts = Indices.size() / NumVec; + // The index should match: 0, NumVec, 2*NumVec, ..., 1, NumVec + 1, ... 
+ for (unsigned i = 0; i < NumVec; i++) + for (unsigned j = 0; j < NumElts; j++) + if (Indices[j + i * NumElts] != j * NumVec + i) + return false; + + return true; +} + +static SDValue +performIndexedLoadStoreCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + bool IsLoad = N->getOpcode() == ISD::ILOAD; + // VecVal = ILOAD (Chain, Pointer, index) + // ISTORE (Chain, Pointer, Index, VecVal) + SDNode *IdxNode = N->getOperand(2).getNode(); // Indexed Node + SmallVector Indices; + for (unsigned i = 0; i < IdxNode->getNumOperands(); i++) { + ConstantSDNode *IdxElt = + dyn_cast(IdxNode->getOperand(i).getNode()); + assert(IdxElt && "Expect a constant index element"); + Indices.push_back(IdxElt->getZExtValue()); + } + + unsigned NumVec, NumElts; + if (!isInterleavedIndices(Indices, NumVec, NumElts)) + return SDValue(); + + // For store, get the stored vector type. For load, get the result type. + EVT VT = + IsLoad ? N->getValueType(0) : N->getOperand(3).getNode()->getValueType(0); + EVT ValVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); + if (!DAG.getTargetLoweringInfo().isTypeLegal(ValVT)) + return SDValue(); + + // Build the operand list. + SmallVector Ops; + Ops.push_back(N->getOperand(0)); // The Chain + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc DL(N); + // Push the intrinsic ID for ldN stN. + Ops.push_back(DAG.getTargetConstant(getLdNStNIntrinsicID(NumVec, IsLoad), + TLI.getPointerTy())); + if (!IsLoad) { + SDValue StoreVec = N->getOperand(3); + for (unsigned i = 0; i < NumVec; i++) { + SDValue ValVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValVT, StoreVec, + DAG.getConstant(0 + i * NumElts, MVT::i64)); + Ops.push_back(ValVec); // The stored vectors + } + } + Ops.push_back(N->getOperand(1)); // The pointer + + EVT Tys[4]; + unsigned n; + unsigned NumRetVecs = IsLoad ? NumVec : 0; + for (n = 0; n < NumRetVecs; ++n) + Tys[n] = ValVT; + Tys[n] = MVT::Other; // Type of the chain + SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumRetVecs + 1)); + + IndexedLoadStoreSDNode *MemNode = cast(N); + unsigned NewOp = IsLoad ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID; + SDValue NewNode = DAG.getMemIntrinsicNode( + NewOp, DL, SDTys, Ops, MemNode->getMemoryVT(), MemNode->getMemOperand()); + + if (!IsLoad) + return NewNode; + + SDValue ResVec; + SDValue Res[4]; + for (unsigned i = 0; i < NumVec; i++) + Res[i] = SDValue(NewNode.getNode(), i); + ResVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, makeArrayRef(Res, NumVec)); + // Replace the result + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), ResVec); + // Replace the Chain + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), + SDValue(NewNode.getNode(), NumVec)); + + return SDValue(); +} + /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. 
static SDValue performNEONPostLDSTCombine(SDNode *N, @@ -8703,6 +8812,9 @@ return performVSelectCombine(N, DCI.DAG); case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); + case ISD::ILOAD: + case ISD::ISTORE: + return performIndexedLoadStoreCombine(N, DCI, DAG); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::CSEL: @@ -8739,6 +8851,7 @@ default: break; } + break; } return SDValue(); } Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -139,6 +139,8 @@ bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); + bool supportIndexedStore(Type *DataType, ArrayRef Indices); + bool supportIndexedLoad(Type *DataType, ArrayRef Indices); /// @} }; Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -407,6 +407,55 @@ return LT.first; } +// Check if the given indices are interleaved by N (N = 2,3,4). +bool static isInterleavedIndices(ArrayRef Indices, unsigned &NumVec, + unsigned &NumElts) { + if (Indices.size() <= 2) + return false; + if (Indices[0] != 0) + return false; + NumVec = Indices[1]; + if (NumVec < 2 || NumVec > 4) + return false; + + NumElts = Indices.size() / NumVec; + // The index should match: 0, NumVec, 2*NumVec, ..., 1, NumVec + 1, ... + for (unsigned i = 0; i < NumVec; i++) + for (unsigned j = 0; j < NumElts; j++) + if (Indices[j + i * NumElts] != j * NumVec + i) + return false; + + return true; +} + +bool AArch64TTIImpl::supportIndexedStore(Type *DataType, + ArrayRef Indices) { + unsigned NumVec, NumElts; + if (!isInterleavedIndices(Indices, NumVec, NumElts)) + return false; + + VectorType *VecType = dyn_cast(DataType); + assert(VecType && VecType->getNumElements() == NumVec * NumElts && + "Expected a vector type"); + + VectorType *ValVec = VectorType::get(VecType->getElementType(), NumElts); + return isTypeLegal(ValVec); +} + +bool AArch64TTIImpl::supportIndexedLoad(Type *DataType, + ArrayRef Indices) { + unsigned NumVec, NumElts; + if (!isInterleavedIndices(Indices, NumVec, NumElts)) + return false; + + VectorType *VecType = dyn_cast(DataType); + assert(VecType && VecType->getNumElements() == NumVec * NumElts && + "Expected a vector type"); + + VectorType *ValVec = VectorType::get(VecType->getElementType(), NumElts); + return isTypeLegal(ValVec); +} + unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { unsigned Cost = 0; for (auto *I : Tys) { Index: test/CodeGen/AArch64/indexed-load-store-noninterleaved.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/indexed-load-store-noninterleaved.ll @@ -0,0 +1,76 @@ +; RUN: llc -print-after codegenprepare < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; CHECK-LABEL: @test_v4i32(i32* %ptr) { +; CHECK: load i32 +; CHECK: insertelement <4 x i32> {{.*}}, i32 0 +; CHECK: getelementptr {{.*}}, i32 1 +; CHECK: load i32 +; CHECK: insertelement <4 x i32> {{.*}}, i32 1 +; CHECK: getelementptr {{.*}}, i32 2 +; CHECK: load i32 +; CHECK: insertelement <4 x i32> {{.*}}, i32 2 +; CHECK: getelementptr {{.*}}, i32 3 +; CHECK: load i32 +; CHECK: insertelement <4 
x i32> {{.*}}, i32 3 + +; CHECK: extractelement <4 x i32> {{.*}}, i32 0 +; CHECK: getelementptr {{.*}}, i32 3 +; CHECK: store i32 +; CHECK: extractelement <4 x i32> {{.*}}, i32 1 +; CHECK: getelementptr {{.*}}, i32 2 +; CHECK: store i32 +; CHECK: extractelement <4 x i32> {{.*}}, i32 2 +; CHECK: getelementptr {{.*}}, i32 1 +; CHECK: store i32 +; CHECK: extractelement <4 x i32> {{.*}}, i32 3 +; CHECK: store i32 + +define void @test_v4i32(i32* %ptr) { +entry: + %indexed.load = call <4 x i32> @llvm.indexed.load.v4i32(i32* %ptr, <4 x i32> , i32 4) + %0 = add nsw <4 x i32> %indexed.load, + call void @llvm.indexed.store.v4i32(<4 x i32> %0, i32* %ptr, <4 x i32> , i32 4) + ret void +} + +; CHECK-LABEL: @test_v4f32(float* %ptr) { +; CHECK: load float +; CHECK: insertelement <4 x float> {{.*}}, i32 0 +; CHECK: getelementptr {{.*}}, i32 2 +; CHECK: load float +; CHECK: insertelement <4 x float> {{.*}}, i32 1 +; CHECK: getelementptr {{.*}}, i32 4 +; CHECK: load float +; CHECK: insertelement <4 x float> {{.*}}, i32 2 +; CHECK: getelementptr {{.*}}, i32 6 +; CHECK: load float +; CHECK: insertelement <4 x float> {{.*}}, i32 3 + +; CHECK: extractelement <4 x float> {{.*}}, i32 0 +; CHECK: store float +; CHECK: extractelement <4 x float> {{.*}}, i32 1 +; CHECK: getelementptr {{.*}}, i32 2 +; CHECK: store float +; CHECK: extractelement <4 x float> {{.*}}, i32 2 +; CHECK: getelementptr {{.*}}, i32 4 +; CHECK: store float +; CHECK: extractelement <4 x float> {{.*}}, i32 3 +; CHECK: getelementptr {{.*}}, i32 6 +; CHECK: store float + +define void @test_v4f32(float* %ptr) { +entry: + %indexed.load = call <4 x float> @llvm.indexed.load.v4f32(float* %ptr, <4 x i32> , i32 4) + %0 = fadd <4 x float> %indexed.load, + call void @llvm.indexed.store.v4f32(<4 x float> %0, float* %ptr, <4 x i32> , i32 4) + ret void +} + +declare <4 x i32> @llvm.indexed.load.v4i32(i32*, <4 x i32>, i32) +declare void @llvm.indexed.store.v4i32(<4 x i32>, i32*, <4 x i32>, i32) +declare <4 x float> @llvm.indexed.load.v4f32(float*, <4 x i32>, i32) +declare void @llvm.indexed.store.v4f32(<4 x float>, float*, <4 x i32>, i32) Index: test/CodeGen/AArch64/interleaved-load-store.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/interleaved-load-store.ll @@ -0,0 +1,73 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; Make sure the intrinsic about 2 interleaved vectors can be matched +; CHECK-LABEL: test_ld2_st2: +; CHECK: ld2 +; CHECK: st2 + +define void @test_ld2_st2(i32* %ptr) { +entry: + %interleave.load = call <8 x i32> @llvm.indexed.load.v8i32(i32* %ptr, <8 x i32> , i32 4) + %0 = shufflevector <8 x i32> %interleave.load, <8 x i32> undef, <4 x i32> + %1 = shufflevector <8 x i32> %interleave.load, <8 x i32> undef, <4 x i32> + %2 = add nsw <4 x i32> %0, + %3 = add nsw <4 x i32> %1, + %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> + call void @llvm.indexed.store.v8i32(<8 x i32> %4, i32* %ptr, <8 x i32> , i32 4) + ret void +} + +; Make sure the intrinsic about 3 interleaved vectors can be matched +; CHECK-LABEL: test_ld3_st3: +; CHECK: ld3 +; CHECK: st3 + +define void @test_ld3_st3(float* %ptr) { +entry: + %interleave.load = call <12 x float> @llvm.indexed.load.v12f32(float* %ptr, <12 x i32> , i32 4) + %0 = shufflevector <12 x float> %interleave.load, <12 x float> undef, <4 x i32> + %1 = shufflevector <12 x float> %interleave.load, <12 x float> undef, <4 x i32> + %2 = shufflevector <12 x float> 
%interleave.load, <12 x float> undef, <4 x i32> + %3 = fadd <4 x float> %0, + %4 = fadd <4 x float> %1, + %5 = fadd <4 x float> %2, + %6 = shufflevector <4 x float> %3, <4 x float> %4, <8 x i32> + %7 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> + %8 = shufflevector <8 x float> %6, <8 x float> %7, <12 x i32> + call void @llvm.indexed.store.v12f32(<12 x float> %8, float* %ptr, <12 x i32> , i32 4) + ret void +} + +; Make sure the intrinsic about 3 interleaved vectors can be matched +; CHECK-LABEL: test_ld4_st4: +; CHECK: ld4 +; CHECK: st4 + +define void @test_ld4_st4(i64* %ptr) { +entry: + %interleave.load = call <8 x i64> @llvm.indexed.load.v8i64(i64* %ptr, <8 x i32> , i32 4) + %0 = shufflevector <8 x i64> %interleave.load, <8 x i64> undef, <2 x i32> + %1 = shufflevector <8 x i64> %interleave.load, <8 x i64> undef, <2 x i32> + %2 = shufflevector <8 x i64> %interleave.load, <8 x i64> undef, <2 x i32> + %3 = shufflevector <8 x i64> %interleave.load, <8 x i64> undef, <2 x i32> + %4 = add nsw <2 x i64> %0, + %5 = add nsw <2 x i64> %1, + %6 = add nsw <2 x i64> %2, + %7 = add nsw <2 x i64> %3, + %8 = shufflevector <2 x i64> %4, <2 x i64> %5, <4 x i32> + %9 = shufflevector <2 x i64> %6, <2 x i64> %7, <4 x i32> + %10 = shufflevector <4 x i64> %8, <4 x i64> %9, <8 x i32> + call void @llvm.indexed.store.v8i64(<8 x i64> %10, i64* %ptr, <8 x i32> , i32 4) + ret void +} + + +declare <8 x i32> @llvm.indexed.load.v8i32(i32*, <8 x i32>, i32) +declare void @llvm.indexed.store.v8i32(<8 x i32>, i32*, <8 x i32>, i32) +declare <12 x float> @llvm.indexed.load.v12f32(float*, <12 x i32>, i32) +declare void @llvm.indexed.store.v12f32(<12 x float>, float*, <12 x i32>, i32) +declare <8 x i64> @llvm.indexed.load.v8i64(i64*, <8 x i32>, i32) +declare void @llvm.indexed.store.v8i64(<8 x i64>, i64*, <8 x i32>, i32) Index: utils/TableGen/IntrinsicEmitter.cpp =================================================================== --- utils/TableGen/IntrinsicEmitter.cpp +++ utils/TableGen/IntrinsicEmitter.cpp @@ -260,10 +260,10 @@ IIT_HALF_VEC_ARG = 29, IIT_SAME_VEC_WIDTH_ARG = 30, IIT_PTR_TO_ARG = 31, - IIT_VEC_OF_PTRS_TO_ELT = 32 + IIT_VEC_OF_PTRS_TO_ELT = 32, + IIT_PTR_TO_VEC_ELT = 33 }; - static void EncodeFixedValueType(MVT::SimpleValueType VT, std::vector &Sig) { if (MVT(VT).isInteger()) { @@ -319,6 +319,8 @@ Sig.push_back(IIT_PTR_TO_ARG); else if (R->isSubClassOf("LLVMVectorOfPointersToElt")) Sig.push_back(IIT_VEC_OF_PTRS_TO_ELT); + else if (R->isSubClassOf("LLVMPointerToVectorElt")) + Sig.push_back(IIT_PTR_TO_VEC_ELT); else Sig.push_back(IIT_ARG); return Sig.push_back((Number << 3) | ArgCodes[Number]);