Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -270,6 +270,13 @@
                                    int64_t BaseOffset, bool HasBaseReg,
                                    int64_t Scale) const;
 
+  /// \brief Return true if the target supports masked load/store instructions.
+  /// AVX2 allows masks for consecutive loads and stores of i32 and i64
+  /// elements; AVX-512 additionally allows masks for non-consecutive memory
+  /// accesses.
+  virtual bool isLegalPredicatedStore(Type *DataType, int Consecutive) const;
+  virtual bool isLegalPredicatedLoad(Type *DataType, int Consecutive) const;
+
   /// \brief Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -675,6 +675,9 @@
     ATOMIC_LOAD_UMIN, ATOMIC_LOAD_UMAX,
 
+    // Masked load and store.
+    MLOAD, MSTORE,
+
     /// This corresponds to the llvm.lifetime.* intrinsics. The first operand
     /// is the chain and the second operand is the alloca pointer.
     LIFETIME_START, LIFETIME_END,
Index: include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- include/llvm/CodeGen/SelectionDAG.h
+++ include/llvm/CodeGen/SelectionDAG.h
@@ -862,6 +862,10 @@
   SDValue getIndexedStore(SDValue OrigStoe, SDLoc dl, SDValue Base,
                           SDValue Offset, ISD::MemIndexedMode AM);
 
+  SDValue getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr,
+                        SDValue Mask, SDValue Src0, MachineMemOperand *MMO);
+  SDValue getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
+                         SDValue Ptr, SDValue Mask, MachineMemOperand *MMO);
+
   /// getSrcValue - Construct a node to track a Value* through the backend.
   SDValue getSrcValue(const Value *v);
Index: include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- include/llvm/CodeGen/SelectionDAGNodes.h
+++ include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1177,6 +1177,8 @@
            N->getOpcode() == ISD::ATOMIC_LOAD_UMAX ||
            N->getOpcode() == ISD::ATOMIC_LOAD      ||
            N->getOpcode() == ISD::ATOMIC_STORE     ||
+           N->getOpcode() == ISD::MLOAD            ||
+           N->getOpcode() == ISD::MSTORE           ||
            N->isMemIntrinsic() ||
            N->isTargetMemoryOpcode();
   }
@@ -1926,6 +1928,79 @@
   }
 };
 
+/// MaskedLoadStoreSDNode - This is the common base class used to represent
+/// ISD::MLOAD and ISD::MSTORE nodes.
+///
+class MaskedLoadStoreSDNode : public MemSDNode {
+  // Operands
+  SDUse Ops[4];
+public:
+  friend class SelectionDAG;
+  MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl,
+                        SDValue *Operands, unsigned numOperands,
+                        SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+    : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
+    InitOperands(Ops, Operands, numOperands);
+  }
+
+  // Both MLOAD and MSTORE nodes are built with the operand order
+  // (Chain, Ptr, Mask, ...), so the base pointer is operand 1 in either case.
+  const SDValue &getBasePtr() const { return getOperand(1); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::MLOAD ||
+           N->getOpcode() == ISD::MSTORE;
+  }
+};
+
+/// MaskedLoadSDNode - This class is used to represent ISD::MLOAD nodes.
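+/// The operands are (Chain, Ptr, Mask, Src0); Src0 supplies the value used
+/// for the masked-off lanes of the result.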
+///
+class MaskedLoadSDNode : public MaskedLoadStoreSDNode {
+public:
+  friend class SelectionDAG;
+  MaskedLoadSDNode(unsigned Order, DebugLoc dl,
+                   SDValue *Operands, unsigned numOperands,
+                   SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+    : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, Operands, numOperands,
+                            VTs, MemVT, MMO)
+  {}
+
+  const SDValue &getBasePtr() const { return getOperand(1); }
+  const SDValue &getMask() const    { return getOperand(2); }
+  const SDValue &getSrc0() const    { return getOperand(3); }
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::MLOAD;
+  }
+};
+
+/// MaskedStoreSDNode - This class is used to represent ISD::MSTORE nodes.
+///
+class MaskedStoreSDNode : public MaskedLoadStoreSDNode {
+public:
+  friend class SelectionDAG;
+  MaskedStoreSDNode(unsigned Order, DebugLoc dl,
+                    SDValue *Operands, unsigned numOperands,
+                    SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+    : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, Operands, numOperands,
+                            VTs, MemVT, MMO)
+  {}
+
+  const SDValue &getBasePtr() const { return getOperand(1); }
+  const SDValue &getMask() const    { return getOperand(2); }
+  const SDValue &getData() const    { return getOperand(3); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::MSTORE;
+  }
+};
+
 /// MachineSDNode - An SDNode that represents everything that will be needed
 /// to construct a MachineInstr. These nodes are created during the
 /// instruction selection proper phase.
Index: include/llvm/IR/IRBuilder.h
===================================================================
--- include/llvm/IR/IRBuilder.h
+++ include/llvm/IR/IRBuilder.h
@@ -429,6 +429,11 @@
   /// If the pointer isn't i8* it will be converted.
   CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = nullptr);
 
+  /// \brief Create a call to a masked intrinsic with the given Id.
+  /// A masked intrinsic has a single overloaded type: the data type.
+  CallInst *CreateMaskedIntrinsic(unsigned Id, ArrayRef<Value *> Ops,
+                                  Type *DataTy);
+
   /// \brief Create an assume intrinsic call that allows the optimizer to
   /// assume that the provided condition will be true.
   CallInst *CreateAssumption(Value *Cond);
Index: include/llvm/IR/Intrinsics.h
===================================================================
--- include/llvm/IR/Intrinsics.h
+++ include/llvm/IR/Intrinsics.h
@@ -76,7 +76,8 @@
     enum IITDescriptorKind {
       Void, VarArg, MMX, Metadata, Half, Float, Double,
       Integer, Vector, Pointer, Struct,
-      Argument, ExtendArgument, TruncArgument, HalfVecArgument
+      Argument, ExtendArgument, TruncArgument, HalfVecArgument,
+      SameVecWidthArgument
     } Kind;
 
     union {
@@ -96,13 +97,15 @@
     };
 
     unsigned getArgumentNumber() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument);
+             Kind == TruncArgument || Kind == HalfVecArgument ||
+             Kind == SameVecWidthArgument);
       return Argument_Info >> 2;
     }
     ArgKind getArgumentKind() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument);
-      return (ArgKind)(Argument_Info&3);
+             Kind == TruncArgument || Kind == HalfVecArgument ||
+             Kind == SameVecWidthArgument);
+      return (ArgKind)(Argument_Info & 3);
     }
 
     static IITDescriptor get(IITDescriptorKind K, unsigned Field) {
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -112,6 +112,10 @@
 // the intrinsic is overloaded, so the matched type should be declared as iAny.
 class LLVMExtendedType<int num> : LLVMMatchType<num>;
 class LLVMTruncatedType<int num> : LLVMMatchType<num>;
+class LLVMVectorSameWidth<int num, LLVMType elty>
+  : LLVMMatchType<num> {
+  ValueType ElTy = elty.VT;
+}
 
 // Match the type of another intrinsic parameter that is expected to be a
 // vector type, but change the element count to be half as many
@@ -539,6 +543,17 @@
 def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
                                 [], "llvm.clear_cache">;
 
+//===-------------------------- Masked Intrinsics -------------------------===//
+//
+def int_masked_store : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty,
+                                      llvm_i32_ty,
+                                      LLVMVectorSameWidth<0, llvm_i1_ty>],
+                                 [IntrReadWriteArgMem]>;
+
+def int_masked_load  : Intrinsic<[llvm_anyvector_ty],
+                                 [llvm_ptr_ty, LLVMMatchType<0>, llvm_i32_ty,
+                                  LLVMVectorSameWidth<0, llvm_i1_ty>],
+                                 [IntrReadArgMem]>;
+
 //===----------------------------------------------------------------------===//
 // Target-specific intrinsics
 //===----------------------------------------------------------------------===//
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -188,6 +188,14 @@
   SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3>
 ]>;
 
+def SDTMaskedStore: SDTypeProfile<0, 3, [       // masked store
+  SDTCisPtrTy<0>, SDTCisVec<1>, SDTCisVec<2>
+]>;
+
+def SDTMaskedLoad: SDTypeProfile<1, 3, [       // masked load
+  SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>
+]>;
+
 def SDTVecShuffle : SDTypeProfile<1, 2, [
   SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
 ]>;
@@ -454,6 +462,13 @@
 def atomic_store : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore,
                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def masked_store : SDNode<"ISD::MSTORE", SDTMaskedStore,
+                          [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def masked_load  : SDNode<"ISD::MLOAD", SDTMaskedLoad,
+                          [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
 // Do not use ld, st directly. Use load, extload, sextload, zextload, store,
 // and truncst (see below).
 def ld         : SDNode<"ISD::LOAD"       , SDTLoad,
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -101,6 +101,17 @@
   return PrevTTI->isLegalICmpImmediate(Imm);
 }
 
+bool TargetTransformInfo::isLegalPredicatedLoad(Type *DataType,
+                                                int Consecutive) const {
+  return false;
+}
+
+bool TargetTransformInfo::isLegalPredicatedStore(Type *DataType,
+                                                 int Consecutive) const {
+  return false;
+}
+
 bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                                 int64_t BaseOffset,
                                                 bool HasBaseReg,
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -303,6 +303,8 @@
     SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
     SDValue visitVECTOR_SHUFFLE(SDNode *N);
     SDValue visitINSERT_SUBVECTOR(SDNode *N);
+    SDValue visitMLOAD(SDNode *N);
+    SDValue visitMSTORE(SDNode *N);
 
     SDValue XformToShuffleWithZero(SDNode *N);
     SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS);
@@ -1347,6 +1349,8 @@
   case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
   case ISD::VECTOR_SHUFFLE:    return visitVECTOR_SHUFFLE(N);
   case ISD::INSERT_SUBVECTOR:  return visitINSERT_SUBVECTOR(N);
+  case ISD::MLOAD:             return visitMLOAD(N);
+  case ISD::MSTORE:            return visitMSTORE(N);
   }
   return SDValue();
 }
@@ -4767,6 +4771,140 @@
                      TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
 }
 
+SDValue DAGCombiner::visitMSTORE(SDNode *N) {
+  if (Level >= AfterLegalizeTypes)
+    return SDValue();
+
+  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+  SDValue Mask = MST->getMask();
+  SDValue Data = MST->getData();
+  SDLoc DL(N);
+
+  if (Mask.getOpcode() == ISD::SETCC) {
+    // Check if any splitting is required.
+    if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
+        TargetLowering::TypeSplitVector)
+      return SDValue();
+
+    SDValue MaskLo, MaskHi, Lo, Hi;
+    std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
+
+    SDValue Chain = MST->getChain();
+    SDValue Ptr   = MST->getBasePtr();
+
+    EVT MemoryVT = MST->getMemoryVT();
+    unsigned Alignment = MST->getOriginalAlignment();
+
+    EVT LoMemVT, HiMemVT;
+    std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+    SDValue DataLo, DataHi;
+    std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
+    MachineMemOperand *MMO = DAG.getMachineFunction().
+      getMachineMemOperand(MST->getPointerInfo(),
+                           MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
+                           Alignment, MST->getAAInfo(), MST->getRanges());
+
+    Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, MMO);
+
+    unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+    MMO = DAG.getMachineFunction().
+      getMachineMemOperand(MST->getPointerInfo(),
+                           MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
+                           Alignment, MST->getAAInfo(), MST->getRanges());
+
+    Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, MMO);
+
+    AddToWorklist(Lo.getNode());
+    AddToWorklist(Hi.getNode());
+
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitMLOAD(SDNode *N) {
+  if (Level >= AfterLegalizeTypes)
+    return SDValue();
+
+  MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
+  SDValue Mask = MLD->getMask();
+  SDLoc DL(N);
+
+  if (Mask.getOpcode() == ISD::SETCC) {
+    EVT VT = N->getValueType(0);
+
+    // Check if any splitting is required.
+    if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+        TargetLowering::TypeSplitVector)
+      return SDValue();
+
+    SDValue MaskLo, MaskHi, Lo, Hi;
+    std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
+
+    SDValue Src0 = MLD->getSrc0();
+    SDValue Src0Lo, Src0Hi;
+    std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);
+
+    EVT LoVT, HiVT;
+    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
+
+    SDValue Chain = MLD->getChain();
+    SDValue Ptr   = MLD->getBasePtr();
+    EVT MemoryVT = MLD->getMemoryVT();
+    unsigned Alignment = MLD->getOriginalAlignment();
+
+    EVT LoMemVT, HiMemVT;
+    std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+    MachineMemOperand *MMO = DAG.getMachineFunction().
+      getMachineMemOperand(MLD->getPointerInfo(),
+                           MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
+                           Alignment, MLD->getAAInfo(), MLD->getRanges());
+
+    Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, MMO);
+
+    unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+    MMO = DAG.getMachineFunction().
+      getMachineMemOperand(MLD->getPointerInfo(),
+                           MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
+                           Alignment, MLD->getAAInfo(), MLD->getRanges());
+
+    Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, MMO);
+
+    AddToWorklist(Lo.getNode());
+    AddToWorklist(Hi.getNode());
+
+    // Build a factor node to remember that this load is independent of the
+    // other one.
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
+                        Hi.getValue(1));
+
+    // Legalized the chain result - switch anything that used the old chain to
+    // use the new one.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);
+
+    SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+
+    SDValue RetOps[] = { LoadRes, Chain };
+    return DAG.getMergeValues(RetOps, DL);
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitVSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -825,6 +825,10 @@
   case ISD::SINT_TO_FP:   Res = PromoteIntOp_SINT_TO_FP(N); break;
   case ISD::STORE:        Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
                                                    OpNo); break;
+  case ISD::MSTORE:       Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N),
+                                                    OpNo); break;
+  case ISD::MLOAD:        Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
+                                                   OpNo); break;
   case ISD::TRUNCATE:     Res = PromoteIntOp_TRUNCATE(N); break;
   case ISD::FP16_TO_FP:
   case ISD::UINT_TO_FP:   Res = PromoteIntOp_UINT_TO_FP(N); break;
@@ -1091,6 +1095,25 @@
                            N->getMemoryVT(), N->getMemOperand());
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
+                                              unsigned OpNo) {
+  assert(OpNo == 2 && "Only know how to promote the mask!");
+  EVT DataVT = N->getOperand(3).getValueType();
+  SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+  SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+  NewOps[OpNo] = Mask;
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
+                                             unsigned OpNo) {
+  assert(OpNo == 2 && "Only know how to promote the mask!");
+  EVT DataVT = N->getValueType(0);
+  SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+  SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+  NewOps[OpNo] = Mask;
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op);
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -285,6 +285,8 @@
   SDValue PromoteIntOp_TRUNCATE(SDNode *N);
   SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
+  SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
 
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
 
@@ -568,6 +570,7 @@
   void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_IntrTernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_BUILD_PAIR(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -578,6 +581,7 @@
   void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -594,10 +598,12 @@
   SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+  SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
   SDValue SplitVecOp_TRUNCATE(SDNode *N);
   SDValue SplitVecOp_VSETCC(SDNode *N);
   SDValue SplitVecOp_FP_ROUND(SDNode *N);
+  SDValue SplitVecOp_IntrOp(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Widening Support: LegalizeVectorTypes.cpp
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -597,6 +597,9 @@
   case ISD::LOAD:
     SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
     break;
+  case ISD::MLOAD:
+    SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
+    break;
   case ISD::SETCC:
     SplitVecRes_SETCC(N, Lo, Hi);
     break;
@@ -979,6 +982,58 @@
   ReplaceValueWith(SDValue(LD, 1), Ch);
 }
 
+void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT LoVT, HiVT;
+  SDLoc dl(MLD);
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
+
+  SDValue Ch  = MLD->getChain();
+  SDValue Ptr = MLD->getBasePtr();
+  SDValue Mask = MLD->getMask();
+  unsigned Alignment = MLD->getOriginalAlignment();
+
+  SDValue MaskLo, MaskHi;
+  std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+
+  EVT MemoryVT = MLD->getMemoryVT();
+  EVT LoMemVT, HiMemVT;
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+  SDValue Src0 = MLD->getSrc0();
+  SDValue Src0Lo, Src0Hi;
+  std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
+
+  MachineMemOperand *MMO = DAG.getMachineFunction().
+    getMachineMemOperand(MLD->getPointerInfo(),
+                         MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
+                         Alignment, MLD->getAAInfo(), MLD->getRanges());
+
+  Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, MMO);
+
+  unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+  MMO = DAG.getMachineFunction().
+    getMachineMemOperand(MLD->getPointerInfo(),
+                         MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
+                         Alignment, MLD->getAAInfo(), MLD->getRanges());
+
+  Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, MMO);
+
+  // Build a factor node to remember that this load is independent of the
+  // other one.
+  Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                   Hi.getValue(1));
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(MLD, 1), Ch);
+}
+
 void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
   assert(N->getValueType(0).isVector() &&
          N->getOperand(0).getValueType().isVector() &&
@@ -1234,6 +1289,9 @@
   case ISD::STORE:
     Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
     break;
+  case ISD::MSTORE:
+    Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
+    break;
   case ISD::VSELECT:
     Res = SplitVecOp_VSELECT(N, OpNo);
     break;
@@ -1395,6 +1453,50 @@
                      MachinePointerInfo(), EltVT, false, false, false, 0);
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
+                                            unsigned OpNo) {
+  SDValue Ch  = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  SDValue Mask = N->getMask();
+  SDValue Data = N->getData();
+  EVT MemoryVT = N->getMemoryVT();
+  unsigned Alignment = N->getOriginalAlignment();
+  SDLoc DL(N);
+
+  EVT LoMemVT, HiMemVT;
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+  SDValue DataLo, DataHi;
+  GetSplitVector(Data, DataLo, DataHi);
+  SDValue MaskLo, MaskHi;
+  GetSplitVector(Mask, MaskLo, MaskHi);
+
+  SDValue Lo, Hi;
+  MachineMemOperand *MMO = DAG.getMachineFunction().
+    getMachineMemOperand(N->getPointerInfo(),
+                         MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
+                         Alignment, N->getAAInfo(), N->getRanges());
+
+  Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, MMO);
+
+  unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+  Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+  MMO = DAG.getMachineFunction().
+    getMachineMemOperand(N->getPointerInfo(),
+                         MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
+                         Alignment, N->getAAInfo(), N->getRanges());
+
+  Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, MMO);
+
+  // Build a factor node to remember that this store is independent of the
+  // other one.
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
   assert(N->isUnindexed() && "Indexed store of vector?");
   assert(OpNo == 1 && "Can only split the stored value");
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4909,6 +4909,60 @@
   return SDValue(N, 0);
 }
 
+SDValue
+SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
+                            SDValue Ptr, SDValue Mask, SDValue Src0,
+                            MachineMemOperand *MMO) {
+  SDVTList VTs = getVTList(VT, MVT::Other);
+  SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
+  ID.AddInteger(VT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED,
+                                     MMO->isVolatile(),
+                                     MMO->isNonTemporal(),
+                                     MMO->isInvariant()));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void *IP = nullptr;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<MaskedLoadSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(),
+                                                   dl.getDebugLoc(), Ops, 4,
+                                                   VTs, VT, MMO);
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
+                                     SDValue Ptr, SDValue Mask,
+                                     MachineMemOperand *MMO) {
+  assert(Chain.getValueType() == MVT::Other &&
+         "Invalid chain type");
+  EVT VT = Val.getValueType();
+  SDVTList VTs = getVTList(MVT::Other);
+  SDValue Ops[] = { Chain, Ptr, Mask, Val };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops);
+  ID.AddInteger(VT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(),
+                                     MMO->isNonTemporal(), MMO->isInvariant()));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void *IP = nullptr;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<MaskedStoreSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(),
+                                                    dl.getDebugLoc(), Ops, 4,
+                                                    VTs, VT, MMO);
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+  return SDValue(N, 0);
+}
+
 SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl,
                                SDValue Chain, SDValue Ptr,
                                SDValue SV,
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -756,6 +756,8 @@
   void visitAlloca(const AllocaInst &I);
   void visitLoad(const LoadInst &I);
   void visitStore(const StoreInst &I);
+  void visitMaskedLoad(const CallInst &I);
+  void visitMaskedStore(const CallInst &I);
   void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
   void visitAtomicRMW(const AtomicRMWInst &I);
   void visitFence(const FenceInst &I);
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3613,6 +3613,72 @@
   DAG.setRoot(StoreNode);
 }
 
+void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
+  SDLoc sdl = getCurSDLoc();
+
+  // llvm.masked.store(ptr, data, alignment, mask)
+  Value *PtrOperand = I.getArgOperand(0);
+  SDValue Ptr = getValue(PtrOperand);
+  SDValue Src0 = getValue(I.getArgOperand(1));
+  SDValue Mask = getValue(I.getArgOperand(3));
+  EVT VT = Src0.getValueType();
+  unsigned Alignment = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+  if (!Alignment)
+    Alignment = DAG.getEVTAlignment(VT);
+
+  AAMDNodes AAInfo;
+  I.getAAMetadata(AAInfo);
+
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction().
+    getMachineMemOperand(MachinePointerInfo(PtrOperand),
+                         MachineMemOperand::MOStore, VT.getStoreSize(),
+                         Alignment, AAInfo);
+  SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, MMO);
+  DAG.setRoot(StoreNode);
+  setValue(&I, StoreNode);
+}
+
+void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
+  SDLoc sdl = getCurSDLoc();
+
+  // llvm.masked.load(ptr, passthru, alignment, mask)
+  Value *PtrOperand = I.getArgOperand(0);
+  SDValue Ptr = getValue(PtrOperand);
+  SDValue Src0 = getValue(I.getArgOperand(1));
+  SDValue Mask = getValue(I.getArgOperand(3));
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = TLI.getValueType(I.getType());
+  unsigned Alignment = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+  if (!Alignment)
+    Alignment = DAG.getEVTAlignment(VT);
+
+  AAMDNodes AAInfo;
+  I.getAAMetadata(AAInfo);
+  const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
+
+  SDValue InChain = DAG.getRoot();
+  if (AA->pointsToConstantMemory(
+        AliasAnalysis::Location(PtrOperand,
+                                AA->getTypeStoreSize(I.getType()),
+                                AAInfo))) {
+    // Do not serialize (non-volatile) loads of constant memory with anything.
+    InChain = DAG.getEntryNode();
+  }
+
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction().
+    getMachineMemOperand(MachinePointerInfo(PtrOperand),
+                         MachineMemOperand::MOLoad, VT.getStoreSize(),
+                         Alignment, AAInfo, Ranges);
+
+  SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, MMO);
+  SDValue OutChain = Load.getValue(1);
+  DAG.setRoot(OutChain);
+  setValue(&I, Load);
+}
+
 void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
   SDLoc dl = getCurSDLoc();
   AtomicOrdering SuccessOrder = I.getSuccessOrdering();
@@ -4914,6 +4980,12 @@
     return nullptr;
   }
+  case Intrinsic::masked_load:
+    visitMaskedLoad(I);
+    return nullptr;
+  case Intrinsic::masked_store:
+    visitMaskedStore(I);
+    return nullptr;
   case Intrinsic::x86_mmx_pslli_w:
   case Intrinsic::x86_mmx_pslli_d:
   case Intrinsic::x86_mmx_pslli_q:
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -269,6 +269,8 @@
   // Other operators
   case ISD::LOAD:                       return "load";
   case ISD::STORE:                      return "store";
+  case ISD::MLOAD:                      return "masked_load";
+  case ISD::MSTORE:                     return "masked_store";
   case ISD::VAARG:                      return "vaarg";
   case ISD::VACOPY:                     return "vacopy";
   case ISD::VAEND:                      return "vaend";
Index: lib/IR/Function.cpp
===================================================================
--- lib/IR/Function.cpp
+++ lib/IR/Function.cpp
@@ -515,7 +515,8 @@
   IIT_ANYPTR = 26,
   IIT_V1 = 27,
   IIT_VARARG = 28,
-  IIT_HALF_VEC_ARG = 29
+  IIT_HALF_VEC_ARG = 29,
+  IIT_SAME_VEC_WIDTH_ARG = 30
 };
 
@@ -623,6 +624,12 @@
                                              ArgInfo));
     return;
   }
+  case IIT_SAME_VEC_WIDTH_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
+                                             ArgInfo));
+    return;
+  }
   case IIT_EMPTYSTRUCT:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0));
     return;
@@ -730,7 +737,14 @@
   case IITDescriptor::HalfVecArgument:
     return VectorType::getHalfElementsVectorType(cast<VectorType>(
                                                  Tys[D.getArgumentNumber()]));
-  }
+  case IITDescriptor::SameVecWidthArgument: {
+    Type *EltTy = DecodeFixedType(Infos, Tys, Context);
+    Type *Ty = Tys[D.getArgumentNumber()];
+    if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+      return VectorType::get(EltTy, VTy->getNumElements());
+    llvm_unreachable("unhandled");
+  }
+  }
   llvm_unreachable("unhandled");
 }
Index: lib/IR/IRBuilder.cpp
===================================================================
--- lib/IR/IRBuilder.cpp
+++ lib/IR/IRBuilder.cpp
@@ -183,3 +183,11 @@
   return createCallHelper(FnAssume, Ops, this);
 }
 
+CallInst *IRBuilderBase::CreateMaskedIntrinsic(unsigned Id,
+                                               ArrayRef<Value *> Ops,
+                                               Type *DataTy) {
+  Module *M = BB->getParent()->getParent();
+  Type *OverloadedTypes[] = { DataTy };
+  Value *TheFn = Intrinsic::getDeclaration(M, (Intrinsic::ID)Id,
+                                           OverloadedTypes);
+  return createCallHelper(TheFn, Ops, this);
+}
Index: lib/IR/Verifier.cpp
===================================================================
--- lib/IR/Verifier.cpp
+++ lib/IR/Verifier.cpp
@@ -2399,6 +2399,19 @@
            !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
            VectorType::getHalfElementsVectorType(
              cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+  case IITDescriptor::SameVecWidthArgument: {
+    if (D.getArgumentNumber() >= ArgTys.size())
+      return true;
+    VectorType *ReferenceType =
+      dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+    VectorType *ThisArgType = dyn_cast<VectorType>(Ty);
+    if (!ThisArgType || !ReferenceType ||
+        (ReferenceType->getVectorNumElements() !=
+         ThisArgType->getVectorNumElements()))
+      return true;
+    return VerifyIntrinsicType(ThisArgType->getVectorElementType(),
+                               Infos, ArgTys);
+  }
   }
   llvm_unreachable("unhandled");
 }
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1327,13 +1327,21 @@
       // Extract subvector is special because the value type
       // (result) is 128-bit but the source is 256-bit wide.
-      if (VT.is128BitVector())
+      if (VT.is128BitVector()) {
+        if (VT.getScalarSizeInBits() >= 32) {
+          setOperationAction(ISD::MLOAD,  VT, Custom);
+          setOperationAction(ISD::MSTORE, VT, Custom);
+        }
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+      }
       // Do not attempt to custom lower other non-256-bit vectors
       if (!VT.is256BitVector())
         continue;
 
+      if (VT.getScalarSizeInBits() >= 32) {
+        setOperationAction(ISD::MLOAD,  VT, Legal);
+        setOperationAction(ISD::MSTORE, VT, Legal);
+      }
       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
@@ -1496,9 +1504,13 @@
       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
       // Extract subvector is special because the value type
       // (result) is 256/128-bit but the source is 512-bit wide.
- if (VT.is128BitVector() || VT.is256BitVector()) + if (VT.is128BitVector() || VT.is256BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - + if ( EltSize >= 32) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } + } if (VT.getVectorElementType() == MVT::i1) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); @@ -1514,6 +1526,8 @@ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); } } for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -2099,6 +2099,41 @@ (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), + (VMOVUPSZmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)), + (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)), + (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, + (bc_v16f32 (v16i32 immAllZerosV)))), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))), + (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8f64 (v16i32 immAllZerosV)))), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))), + (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32", "16", "8", "4", SSEPackedInt, HasAVX512>, avx512_store_vl<0x7F, "vmovdqa32", "alignedstore", @@ -2173,6 +2208,46 @@ (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))), + (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8i64 (v16i32 immAllZerosV)))), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))), + (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)), + (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + 
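+// Masked stores select onto the unaligned move-with-mask forms; lanes whose
+// mask bit is clear leave the corresponding memory untouched.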
+def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)), + (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +// SKX replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>; + +// KNL replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Zmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + + // Move Int Doubleword to Packed Double Int // def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -9260,6 +9260,61 @@ int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256>, VEX_W; +def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), + (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), + (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), + (bc_v8f32 (v8i32 immAllZerosV)))), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), + (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), + (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (v4f64 immAllZerosV))), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (bc_v4i64 (v8i32 immAllZerosV)))), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + //===----------------------------------------------------------------------===// // Variable Bit Shifts Index: lib/Target/X86/X86TargetTransformInfo.cpp 
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -111,6 +111,8 @@
                          Type *Ty) const override;
   unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                          Type *Ty) const override;
+  bool isLegalPredicatedLoad(Type *DataType, int Consecutive) const override;
+  bool isLegalPredicatedStore(Type *DataType, int Consecutive) const override;
 
   /// @}
 };
@@ -618,6 +620,14 @@
     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
 
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   2 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
   };
 
   if (ST->hasAVX512()) {
@@ -1147,3 +1157,18 @@
   }
   return X86TTI::getIntImmCost(Imm, Ty);
 }
+
+bool X86TTI::isLegalPredicatedLoad(Type *DataType, int Consecutive) const {
+  int ScalarWidth = DataType->getScalarSizeInBits();
+
+  // TODO: AVX-512 also provides masked gather/scatter, which can handle
+  // strided and other non-consecutive accesses.
+  if (ScalarWidth < 32 || Consecutive == 0)
+    return false;
+  return ST->hasAVX512() || ST->hasAVX2();
+}
+
+bool X86TTI::isLegalPredicatedStore(Type *DataType, int Consecutive) const {
+  return isLegalPredicatedLoad(DataType, Consecutive);
+}
\ No newline at end of file
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -580,9 +580,10 @@
   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                             DominatorTree *DT, TargetLibraryInfo *TLI,
-                            AliasAnalysis *AA, Function *F)
+                            AliasAnalysis *AA, Function *F,
+                            const TargetTransformInfo *TTI)
       : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
-        DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr),
+        DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr),
         WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
   }
 
@@ -768,6 +769,15 @@
   }
   SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
 
+  bool canPredicateStore(Type *DataType, Value *Ptr) {
+    return TTI->isLegalPredicatedStore(DataType, isConsecutivePtr(Ptr));
+  }
+  bool canPredicateLoad(Type *DataType, Value *Ptr) {
+    return TTI->isLegalPredicatedLoad(DataType, isConsecutivePtr(Ptr));
+  }
+  /// Returns true if \p I will be emitted as a masked load or store.
+  bool isMaskedOp(const Instruction *I) {
+    return MaskedOp.count(I) != 0;
+  }
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -840,6 +850,8 @@
   AliasAnalysis *AA;
   /// Parent function
   Function *TheFunction;
+  /// Target Transform Info
+  const TargetTransformInfo *TTI;
 
   // --- vectorization state --- //
 
@@ -871,6 +883,7 @@
   ValueToValueMap Strides;
   SmallPtrSet<Value *, 8> StrideSet;
 
+  /// Loads and stores that will be turned into masked memory operations.
+  std::set<const Instruction *> MaskedOp;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1375,7 +1388,7 @@
   }
 
   // Check if it is legal to vectorize the loop.
-  LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F);
+  LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI);
   if (!LVL.canVectorize()) {
     DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
     emitMissedWarning(F, L, Hints);
@@ -1763,7 +1776,8 @@
   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
 
-  if (SI && Legal->blockNeedsPredication(SI->getParent()))
+  if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
+      !Legal->isMaskedOp(SI))
     return scalarizeInstruction(Instr, true);
 
   if (ScalarAllocatedSize != VectorElementSize)
@@ -1857,8 +1871,25 @@
       Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                             DataTy->getPointerTo(AddressSpace));
-      StoreInst *NewSI =
-        Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
+
+      Instruction *NewSI;
+      if (Legal->isMaskedOp(SI)) {
+        Type *I8PtrTy =
+          Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace());
+
+        Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy);
+
+        VectorParts Cond = createEdgeMask(SI->getParent()->getSinglePredecessor(),
+                                          SI->getParent());
+        SmallVector<Value *, 8> Ops;
+        Ops.push_back(I8Ptr);
+        Ops.push_back(StoredVal[Part]);
+        Ops.push_back(Builder.getInt32(Alignment));
+        Ops.push_back(Cond[Part]);
+        NewSI = Builder.CreateMaskedIntrinsic(Intrinsic::masked_store, Ops,
+                                              StoredVal[Part]->getType());
+      } else
+        NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
       propagateMetadata(NewSI, SI);
     }
     return;
@@ -1873,14 +1904,31 @@
 
     if (Reverse) {
       // If the address is consecutive but reversed, then the
-      // wide store needs to start at the last vector element.
+      // wide load needs to start at the last vector element.
       PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
       PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
     }
 
-    Value *VecPtr = Builder.CreateBitCast(PartPtr,
-                                          DataTy->getPointerTo(AddressSpace));
-    LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+    Instruction *NewLI;
+    if (Legal->isMaskedOp(LI)) {
+      Type *I8PtrTy =
+        Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace());
+
+      Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy);
+
+      VectorParts SrcMask = createBlockInMask(LI->getParent());
+      SmallVector<Value *, 8> Ops;
+      Ops.push_back(I8Ptr);
+      Ops.push_back(UndefValue::get(DataTy));
+      Ops.push_back(Builder.getInt32(Alignment));
+      Ops.push_back(SrcMask[Part]);
+      NewLI = Builder.CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy);
+    } else {
+      Value *VecPtr = Builder.CreateBitCast(PartPtr,
+                                            DataTy->getPointerTo(AddressSpace));
+      NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+    }
     propagateMetadata(NewLI, LI);
     Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
   }
@@ -5301,10 +5349,18 @@
                                            SmallPtrSetImpl<Value *> &SafePtrs) {
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     // We might be able to hoist the load.
+    if (it->mayReadFromMemory()) {
       LoadInst *LI = dyn_cast<LoadInst>(it);
-      if (!LI || !SafePtrs.count(LI->getPointerOperand()))
+      if (!LI)
         return false;
+      if (!SafePtrs.count(LI->getPointerOperand())) {
+        if (canPredicateLoad(LI->getType(), LI->getPointerOperand())) {
+          MaskedOp.insert(LI);
+          continue;
+        }
+        return false;
+      }
     }
 
     // We don't predicate stores at the moment.
@@ -5312,10 +5368,20 @@
     StoreInst *SI = dyn_cast<StoreInst>(it);
     // We only support predication of stores in basic blocks with one
     // predecessor.
- if (!SI || ++NumPredStores > NumberOfStoresToPredicate || + if (!SI) + return false; + + if (++NumPredStores > NumberOfStoresToPredicate || !SafePtrs.count(SI->getPointerOperand()) || - !SI->getParent()->getSinglePredecessor()) + !SI->getParent()->getSinglePredecessor()) { + if (canPredicateStore(SI->getValueOperand()->getType(), + SI->getPointerOperand())) { + MaskedOp.insert(SI); + --NumPredStores; + continue; + } return false; + } } if (it->mayThrow()) return false; @@ -5338,7 +5404,6 @@ return false; } } - return true; } @@ -5379,7 +5444,7 @@ MaxVectorSize = 1; } - assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements" + assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" " into one vector!"); unsigned VF = MaxVectorSize; @@ -5440,7 +5505,7 @@ // the vector elements. float VectorCost = expectedCost(i) / (float)i; DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << - (int)VectorCost << ".\n"); + VectorCost << ".\n"); if (VectorCost < Cost) { Cost = VectorCost; Width = i; Index: test/CodeGen/X86/masked_memop.ll =================================================================== --- test/CodeGen/X86/masked_memop.ll +++ test/CodeGen/X86/masked_memop.ll @@ -0,0 +1,73 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2 + +; AVX512-LABEL: test1 +; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z} + +; AVX2-LABEL: test1 +; AVX2: vpmaskmovd 32(%rdi) +; AVX2: vpmaskmovd (%rdi) +; AVX2-NOT: blend + +define <16 x i32> @test1(<16 x i32> %trigger, i8* %addr) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + %res = call <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32>undef, i32 4, <16 x i1>%mask) + ret <16 x i32> %res +} + +; AVX512-LABEL: test2 +; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z} + +; AVX2-LABEL: test2 +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2-NOT: blend +define <16 x i32> @test2(<16 x i32> %trigger, i8* %addr) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + %res = call <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32>zeroinitializer, i32 4, <16 x i1>%mask) + ret <16 x i32> %res +} + +; AVX512-LABEL: test3 +; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1} + +define void @test3(<16 x i32> %trigger, i8* %addr, <16 x i32> %val) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v16i32(i8* %addr, <16 x i32>%val, i32 4, <16 x i1>%mask) + ret void +} + +; AVX512-LABEL: test4 +; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}} + +; AVX2-LABEL: test4 +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2: blend +define <16 x float> @test4(<16 x i32> %trigger, i8* %addr, <16 x float> %dst) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + %res = call <16 x float> @llvm.masked.load.v16f32(i8* %addr, <16 x float>%dst, i32 4, <16 x i1>%mask) + ret <16 x float> %res +} + +; AVX512-LABEL: test5 +; AVX512: vmovupd (%rdi), %zmm1 {%k1} + +; AVX2-LABEL: test5 +; AVX2: vpmaskmovq +; AVX2: vblendvpd +; AVX2: vpmaskmovq +; AVX2: vblendvpd +define <8 x double> @test5(<8 x i32> %trigger, i8* %addr, <8 x double> %dst) { + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x double> @llvm.masked.load.v8f64(i8* %addr, <8 x double>%dst, i32 4, <8 x i1>%mask) + ret <8 x double> %res +} + + +declare <16 x i32> @llvm.masked.load.v16i32(i8*, <16 x i32>, i32, <16 x i1>) +declare void 
@llvm.masked.store.v16i32(i8*, <16 x i32>, i32, <16 x i1>) +declare <16 x float> @llvm.masked.load.v16f32(i8*, <16 x float>, i32, <16 x i1>) +declare void @llvm.masked.store.v16f32(i8*, <16 x float>, i32, <16 x i1>) +declare <8 x double> @llvm.masked.load.v8f64(i8*, <8 x double>, i32, <8 x i1>) +declare void @llvm.masked.store.v8f64(i8*, <8 x double>, i32, <8 x i1>) \ No newline at end of file Index: test/Transforms/LoopVectorize/X86/mask1.ll =================================================================== --- test/Transforms/LoopVectorize/X86/mask1.ll +++ test/Transforms/LoopVectorize/X86/mask1.ll @@ -0,0 +1,83 @@ +; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1 +; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2 +; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc_linux" + +; The source code: +; +;void foo(int *A, int *B, int *trigger) { +; +; for (int i=0; i<10000; i++) { +; if (trigger[i] < 100) { +; A[i] = B[i] + trigger[i]; +; } +; } +;} + + +;AVX2: llvm.masked.load.v8i32 +;AVX2: llvm.masked.store.v8i32 +;AVX512: llvm.masked.load.v16i32 +;AVX512: llvm.masked.store.v16i32 +;AVX1-NOT: llvm.masked + +; Function Attrs: nounwind uwtable +define void @foo(i32* %A, i32* %B, i32* %trigger) { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8 + store i32* %B, i32** %B.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom + %3 = load i32* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load i32** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds i32* %5, i64 %idxprom2 + %6 = load i32* %arrayidx3, align 4 + %7 = load i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4 + %9 = load i32* %arrayidx5, align 4 + %add = add nsw i32 %6, %9 + %10 = load i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load i32** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds i32* %11, i64 %idxprom6 + store i32 %add, i32* %arrayidx7, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/Transforms/LoopVectorize/X86/mask2.ll =================================================================== --- test/Transforms/LoopVectorize/X86/mask2.ll +++ test/Transforms/LoopVectorize/X86/mask2.ll @@ -0,0 +1,84 @@ +; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1 +; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2 +; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 + +target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc_linux" + +; The source code: +; +;void foo(float *A, float *B, int *trigger) { +; +; for (int i=0; i<10000; i++) { +; if (trigger[i] < 100) { +; A[i] = B[i] + trigger[i]; +; } +; } +;} + + +;AVX2: llvm.masked.load.v8f32 +;AVX2: llvm.masked.store.v8f32 +;AVX512: llvm.masked.load.v16f32 +;AVX512: llvm.masked.store.v16f32 +;AVX1-NOT: llvm.masked + +; Function Attrs: nounwind uwtable +define void @foo(float* %A, float* %B, i32* %trigger) { +entry: + %A.addr = alloca float*, align 8 + %B.addr = alloca float*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store float* %A, float** %A.addr, align 8 + store float* %B, float** %B.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom + %3 = load i32* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load float** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds float* %5, i64 %idxprom2 + %6 = load float* %arrayidx3, align 4 + %7 = load i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4 + %9 = load i32* %arrayidx5, align 4 + %conv = sitofp i32 %9 to float + %add = fadd float %6, %conv + %10 = load i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load float** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds float* %11, i64 %idxprom6 + store float %add, float* %arrayidx7, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/Transforms/LoopVectorize/X86/mask3.ll =================================================================== --- test/Transforms/LoopVectorize/X86/mask3.ll +++ test/Transforms/LoopVectorize/X86/mask3.ll @@ -0,0 +1,84 @@ +; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1 +; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2 +; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc_linux" + +; The source code: +; +;void foo(double *A, double *B, int *trigger) { +; +; for (int i=0; i<10000; i++) { +; if (trigger[i] < 100) { +; A[i] = B[i] + trigger[i]; +; } +; } +;} + + +;AVX2: llvm.masked.load.v4f64 +;AVX2: llvm.masked.store.v4f64 +;AVX512: llvm.masked.load.v8f64 +;AVX512: llvm.masked.store.v8f64 +;AVX1-NOT: llvm.masked + +; Function Attrs: nounwind uwtable +define void @foo(double* %A, double* %B, i32* %trigger) #0 { +entry: + %A.addr = alloca double*, align 8 + %B.addr = alloca double*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store double* %A, double** %A.addr, align 8 + store double* %B, double** %B.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + 
store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom + %3 = load i32* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load double** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2 + %6 = load double* %arrayidx3, align 8 + %7 = load i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4 + %9 = load i32* %arrayidx5, align 4 + %conv = sitofp i32 %9 to double + %add = fadd double %6, %conv + %10 = load i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load double** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds double* %11, i64 %idxprom6 + store double %add, double* %arrayidx7, align 8 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/Transforms/LoopVectorize/X86/mask4.ll =================================================================== --- test/Transforms/LoopVectorize/X86/mask4.ll +++ test/Transforms/LoopVectorize/X86/mask4.ll @@ -0,0 +1,83 @@ +; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1 +; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2 +; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc_linux" + +; The source code: +; +;void foo(double *A, double *B, int *trigger) { +; +; for (int i=0; i<10000; i++) { +; if (trigger[i] < 100) { +; A[i] = B[i*2] + trigger[i]; << non-cosecutive access +; } +; } +;} + + +;AVX2-NOT: llvm.masked +;AVX512-NOT: llvm.masked +;AVX1-NOT: llvm.masked + +; Function Attrs: nounwind uwtable +define void @foo(double* %A, double* %B, i32* %trigger) { +entry: + %A.addr = alloca double*, align 8 + %B.addr = alloca double*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store double* %A, double** %A.addr, align 8 + store double* %B, double** %B.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom + %3 = load i32* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32* %i, align 4 + %mul = mul nsw i32 %4, 2 + %idxprom2 = sext i32 %mul to i64 + %5 = load double** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2 + %6 = load double* %arrayidx3, align 8 + %7 = load i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = 
load i32** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4 + %9 = load i32* %arrayidx5, align 4 + %conv = sitofp i32 %9 to double + %add = fadd double %6, %conv + %10 = load i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load double** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds double* %11, i64 %idxprom6 + store double %add, double* %arrayidx7, align 8 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: utils/TableGen/CodeGenTarget.cpp =================================================================== --- utils/TableGen/CodeGenTarget.cpp +++ utils/TableGen/CodeGenTarget.cpp @@ -539,7 +539,8 @@ // variants with iAny types; otherwise, if the intrinsic is not // overloaded, all the types can be specified directly. assert(((!TyEl->isSubClassOf("LLVMExtendedType") && - !TyEl->isSubClassOf("LLVMTruncatedType")) || + !TyEl->isSubClassOf("LLVMTruncatedType") && + !TyEl->isSubClassOf("LLVMVectorSameWidth")) || VT == MVT::iAny || VT == MVT::vAny) && "Expected iAny or vAny type"); } else Index: utils/TableGen/IntrinsicEmitter.cpp =================================================================== --- utils/TableGen/IntrinsicEmitter.cpp +++ utils/TableGen/IntrinsicEmitter.cpp @@ -257,7 +257,8 @@ IIT_ANYPTR = 26, IIT_V1 = 27, IIT_VARARG = 28, - IIT_HALF_VEC_ARG = 29 + IIT_HALF_VEC_ARG = 29, + IIT_SAME_VEC_WIDTH_ARG = 30 }; @@ -305,6 +306,13 @@ Sig.push_back(IIT_TRUNC_ARG); else if (R->isSubClassOf("LLVMHalfElementsVectorType")) Sig.push_back(IIT_HALF_VEC_ARG); + else if (R->isSubClassOf("LLVMVectorSameWidth")) { + Sig.push_back(IIT_SAME_VEC_WIDTH_ARG); + Sig.push_back((Number << 2) | ArgCodes[Number]); + MVT::SimpleValueType VT = getValueType(R->getValueAsDef("ElTy")); + EncodeFixedValueType(VT, Sig); + return; + } else Sig.push_back(IIT_ARG); return Sig.push_back((Number << 2) | ArgCodes[Number]);
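
For reference, a minimal example of the new intrinsics in IR form, matching
the signatures defined in Intrinsics.td above (the v8i32 width, pointer
%ptr, value %val, and alignment here are illustrative only):

  %res = call <8 x i32> @llvm.masked.load.v8i32(i8* %ptr, <8 x i32> undef,
                                                i32 4, <8 x i1> %mask)
  call void @llvm.masked.store.v8i32(i8* %ptr, <8 x i32> %val, i32 4,
                                     <8 x i1> %mask)

  declare <8 x i32> @llvm.masked.load.v8i32(i8*, <8 x i32>, i32, <8 x i1>)
  declare void @llvm.masked.store.v8i32(i8*, <8 x i32>, i32, <8 x i1>)

The load returns the pass-through operand's lanes (undef above) wherever the
mask bit is clear; the store writes only the lanes whose mask bit is set.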