Index: ../include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- ../include/llvm/CodeGen/SelectionDAGNodes.h
+++ ../include/llvm/CodeGen/SelectionDAGNodes.h
@@ -2135,10 +2135,11 @@
     assert(getValue().getValueType() == getValueType(0) &&
            "Incompatible type of the PassThru value in MaskedGatherSDNode");
     assert(getMask().getValueType().getVectorNumElements() ==
-           getValueType(0).getVectorNumElements() &&
-           "Vector width mismatch between mask and data");
-    assert(getMask().getValueType().getScalarType() == MVT::i1 &&
+           getValueType(0).getVectorNumElements() && "Vector width mismatch between mask and data");
+    assert(getIndex().getValueType().getVectorNumElements() ==
+           getValueType(0).getVectorNumElements() &&
+           "Vector width mismatch between index and data");
   }

   static bool classof(const SDNode *N) {
@@ -2154,13 +2155,14 @@
   friend class SelectionDAG;
   MaskedScatterSDNode(unsigned Order, DebugLoc dl, ArrayRef<SDValue> Operands,
                       SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
-    : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, Operands, VTs,
-                                MemVT, MMO) {
+    : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, Operands, VTs, MemVT,
+                                MMO) {
     assert(getMask().getValueType().getVectorNumElements() ==
-           getValue().getValueType().getVectorNumElements() &&
-           "Vector width mismatch between mask and data");
-    assert(getMask().getValueType().getScalarType() == MVT::i1 &&
+           getValue().getValueType().getVectorNumElements() && "Vector width mismatch between mask and data");
+    assert(getIndex().getValueType().getVectorNumElements() ==
+           getValue().getValueType().getVectorNumElements() &&
+           "Vector width mismatch between index and data");
   }

   static bool classof(const SDNode *N) {
Index: ../lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- ../lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ ../lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -65,8 +65,10 @@
   case ISD::CTTZ:        Res = PromoteIntRes_CTTZ(N); break;
   case ISD::EXTRACT_VECTOR_ELT:
                          Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
-  case ISD::LOAD:        Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break;
-  case ISD::MLOAD:       Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));break;
+  case ISD::LOAD:        Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break;
+  case ISD::MLOAD:       Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));
+    break;
+  case ISD::MGATHER:     Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N)); break;
   case ISD::SELECT:      Res = PromoteIntRes_SELECT(N); break;
   case ISD::VSELECT:     Res = PromoteIntRes_VSELECT(N); break;
   case ISD::SELECT_CC:   Res = PromoteIntRes_SELECT_CC(N); break;
@@ -493,6 +495,25 @@
   ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
   return Res;
 }
+
+SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue ExtSrc0 = GetPromotedInteger(N->getValue());
+  assert(NVT == ExtSrc0.getValueType() &&
+         "Gather result type and the passThru argument type should be the same");
+
+  SDLoc dl(N);
+  SDValue Ops[] = {N->getChain(), ExtSrc0, N->getMask(), N->getBasePtr(),
+                   N->getIndex()};
+  SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other),
+                                    N->getMemoryVT(), dl, Ops,
+                                    N->getMemOperand());
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
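For illustration only (not part of the patch, hypothetical function name): a minimal IR sketch of the case PromoteIntRes_MGATHER handles. The <2 x i32> gather result is an illegal integer vector type on x86-64, so the type legalizer rebuilds the gather with the promoted type, promoting the pass-through operand to match:

; Sketch: the <2 x i32> result (and passthru) are promoted to <2 x i64>
; by PromoteIntRes_MGATHER before instruction selection.
declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)

define <2 x i32> @promote_gather_result(<2 x i32*> %ptrs, <2 x i1> %mask, <2 x i32> %passthru) {
  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %mask, <2 x i32> %passthru)
  ret <2 x i32> %res
}

test23 in the test file below exercises the same path with a GEP-derived pointer vector.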
 /// Promote the overflow flag of an overflowing arithmetic node.
 SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
   // Simply change the return type of the boolean result.
@@ -879,6 +900,10 @@
                                                     OpNo); break;
   case ISD::MLOAD:        Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
                                                    OpNo); break;
+  case ISD::MGATHER:      Res = PromoteIntOp_MGATHER(cast<MaskedGatherSDNode>(N),
+                                                     OpNo); break;
+  case ISD::MSCATTER:     Res = PromoteIntOp_MSCATTER(cast<MaskedScatterSDNode>(N),
+                                                      OpNo); break;
   case ISD::TRUNCATE:     Res = PromoteIntOp_TRUNCATE(N); break;
   case ISD::FP16_TO_FP:
   case ISD::UINT_TO_FP:   Res = PromoteIntOp_UINT_TO_FP(N); break;
@@ -1168,35 +1193,29 @@
            "Unexpected data legalization in MSTORE");
     DataOp = GetWidenedVector(DataOp);

-    if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
-      Mask = GetWidenedVector(Mask);
-    else {
-      EVT BoolVT = getSetCCResultType(DataOp.getValueType());
-
-      // We can't use ModifyToType() because we should fill the mask with
-      // zeroes
-      unsigned WidenNumElts = BoolVT.getVectorNumElements();
-      unsigned MaskNumElts = MaskVT.getVectorNumElements();
-
-      unsigned NumConcat = WidenNumElts / MaskNumElts;
-      SmallVector<SDValue, 16> Ops(NumConcat);
-      SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
-      Ops[0] = Mask;
-      for (unsigned i = 1; i != NumConcat; ++i)
-        Ops[i] = ZeroVal;
+    EVT BoolVT = getSetCCResultType(DataOp.getValueType());
+    unsigned WidenNumElts = BoolVT.getVectorNumElements();
+    unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;

-      Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
-    }
+    Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
   }
   else
-    Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType());
+    Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
   return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
                             N->getMemoryVT(), N->getMemOperand(),
                             TruncateStore);
 }

-SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){
+SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
+                                             unsigned OpNo) {
   assert(OpNo == 2 && "Only know how to promote the mask!");
   EVT DataVT = N->getValueType(0);
   SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
@@ -1205,6 +1224,21 @@
   return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
 }

+SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N,
+                                               unsigned OpNo) {
+
+  SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+  NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
+                                                unsigned OpNo) {
+  SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+  NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op);
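Again for illustration only, a hedged IR sketch (hypothetical name) of an operand that can reach PromoteIntOp_MSCATTER: the <2 x i32> value being scattered is an illegal integer vector, so it is promoted during type legalization, and the X86 lowering later shuffles the promoted lanes back down. test21 and test28 below cover this case end to end:

; Sketch: the <2 x i32> data operand of the scatter is promoted to <2 x i64>.
declare void @llvm.masked.scatter.v2i32(<2 x i32>, <2 x i32*>, i32, <2 x i1>)

define void @promote_scatter_operand(<2 x i32> %val, <2 x i32*> %ptrs, <2 x i1> %mask) {
  call void @llvm.masked.scatter.v2i32(<2 x i32> %val, <2 x i32*> %ptrs, i32 4, <2 x i1> %mask)
  ret void
}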
Index: ../lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- ../lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ ../lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -246,6 +246,7 @@
   SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
   SDValue PromoteIntRes_LOAD(LoadSDNode *N);
   SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
+  SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
   SDValue PromoteIntRes_Overflow(SDNode *N);
   SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_SDIV(SDNode *N);
@@ -294,6 +295,8 @@
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
   SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);

   void PromoteSetCCOperands(SDValue &LHS, SDValue &RHS, ISD::CondCode Code);

@@ -683,6 +686,7 @@
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
   SDValue WidenVecRes_LOAD(SDNode* N);
   SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
+  SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
   SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
   SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N);
   SDValue WidenVecRes_SELECT(SDNode* N);
@@ -711,6 +715,7 @@
   SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue WidenVecOp_STORE(SDNode* N);
   SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
+  SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo);
   SDValue WidenVecOp_SETCC(SDNode* N);

   SDValue WidenVecOp_Convert(SDNode *N);
@@ -750,7 +755,12 @@

   /// Modifies a vector input (widens or narrows) to a vector of NVT. The
   /// input vector must have the same element type as NVT.
-  SDValue ModifyToType(SDValue InOp, EVT WidenVT);
+  /// When FillWithZeroes is "on" the vector will be widened with
+  /// zeroes. UseExistingVal means that the existing elements from InOp
+  /// should be used for the widening.
+  /// By default, the vector will be widened with undefined values.
+  SDValue ModifyToType(SDValue InOp, EVT WidenVT, bool FillWithZeroes = false,
+                       bool UseExistingVal = false);

   //===--------------------------------------------------------------------===//
Index: ../lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- ../lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ ../lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1971,6 +1971,9 @@
   case ISD::MLOAD:
     Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
     break;
+  case ISD::MGATHER:
+    Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N));
+    break;

   case ISD::ADD:
   case ISD::AND:
@@ -2696,25 +2699,19 @@
   ISD::LoadExtType ExtType = N->getExtensionType();
   SDLoc dl(N);

-  if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
-    Mask = GetWidenedVector(Mask);
-  else {
-    EVT BoolVT = getSetCCResultType(WidenVT);
-
-    // We can't use ModifyToType() because we should fill the mask with
-    // zeroes
-    unsigned WidenNumElts = BoolVT.getVectorNumElements();
-    unsigned MaskNumElts = MaskVT.getVectorNumElements();
+  EVT BoolVT = getSetCCResultType(WidenVT);

-    unsigned NumConcat = WidenNumElts / MaskNumElts;
-    SmallVector<SDValue, 16> Ops(NumConcat);
-    SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
-    Ops[0] = Mask;
-    for (unsigned i = 1; i != NumConcat; ++i)
-      Ops[i] = ZeroVal;
+  unsigned WidenNumElts = BoolVT.getVectorNumElements();
+  unsigned MaskNumElts = MaskVT.getVectorNumElements();

-    Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
-  }
+  unsigned NumConcat = WidenNumElts / MaskNumElts;
+  SmallVector<SDValue, 16> Ops(NumConcat);
+  SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
+  Ops[0] = Mask;
+  for (unsigned i = 1; i != NumConcat; ++i)
+    Ops[i] = ZeroVal;
+
+  Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);

   SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
                                   Mask, Src0, N->getMemoryVT(),
@@ -2725,6 +2722,41 @@
   return Res;
 }

+SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Mask = N->getMask();
+  MVT MaskVT = Mask.getSimpleValueType();
+  SDValue Src0 = GetWidenedVector(N->getValue());
+  SDLoc dl(N);
+
+  if (getTypeAction(MaskVT) == TargetLowering::TypePromoteInteger) {
+    Mask = GetPromotedInteger(Mask);
+    MaskVT = Mask.getSimpleValueType();
+  }
+  MVT WideMaskVT = MVT::getVectorVT(MaskVT.getScalarType(),
+                                    WidenVT.getVectorNumElements());
+  Mask = ModifyToType(Mask, WideMaskVT, true);
+
+  SDValue Index = N->getIndex();
+  if (getTypeAction(Index.getValueType()) == TargetLowering::TypePromoteInteger)
+    Index = GetPromotedInteger(Index);
+
+  MVT WideIndexVT =
+      MVT::getVectorVT(Index.getSimpleValueType().getScalarType(),
+                       WidenVT.getVectorNumElements());
+  Index = ModifyToType(Index, WideIndexVT, false, true);
+  SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+  SDValue Res = DAG.getMaskedGather(DAG.getVTList(WidenVT, MVT::Other),
+                                    N->getMemoryVT(), dl, Ops,
+                                    N->getMemOperand());
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N),
@@ -2881,6 +2913,7 @@
   case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::STORE:              Res = WidenVecOp_STORE(N); break;
   case ISD::MSTORE:             Res = WidenVecOp_MSTORE(N, OpNo); break;
+  case ISD::MSCATTER:           Res = WidenVecOp_MSCATTER(N, OpNo); break;
   case ISD::SETCC:              Res = WidenVecOp_SETCC(N); break;
   case ISD::FCOPYSIGN:          Res = WidenVecOp_FCOPYSIGN(N); break;

@@ -3097,37 +3130,64 @@
   SDValue Mask = MST->getMask();
   EVT MaskVT = Mask.getValueType();
   SDValue StVal = MST->getValue();
+
+  assert(OpNo == 3 && "Unexpected operand number");
   // Widen the value
   SDValue WideVal = GetWidenedVector(StVal);
   SDLoc dl(N);

-  if (OpNo == 2 || getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
-    Mask = GetWidenedVector(Mask);
-  else {
-    // The mask should be widened as well
-    EVT BoolVT = getSetCCResultType(WideVal.getValueType());
-    // We can't use ModifyToType() because we should fill the mask with
-    // zeroes
-    unsigned WidenNumElts = BoolVT.getVectorNumElements();
-    unsigned MaskNumElts = MaskVT.getVectorNumElements();
+  // The mask should be widened as well
+  EVT BoolVT = getSetCCResultType(WideVal.getValueType());
+  unsigned WidenNumElts = BoolVT.getVectorNumElements();
+  unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+  unsigned NumConcat = WidenNumElts / MaskNumElts;
+  SmallVector<SDValue, 16> Ops(NumConcat);
+  SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
+  Ops[0] = Mask;
+  for (unsigned i = 1; i != NumConcat; ++i)
+    Ops[i] = ZeroVal;

-    unsigned NumConcat = WidenNumElts / MaskNumElts;
-    SmallVector<SDValue, 16> Ops(NumConcat);
-    SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
-    Ops[0] = Mask;
-    for (unsigned i = 1; i != NumConcat; ++i)
-      Ops[i] = ZeroVal;
+  Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);

-    Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
-  }
-  assert(Mask.getValueType().getVectorNumElements() ==
-         WideVal.getValueType().getVectorNumElements() &&
-         "Mask and data vectors should have the same number of elements");
   return DAG.getMaskedStore(MST->getChain(), dl, WideVal, MST->getBasePtr(),
                             Mask, MST->getMemoryVT(), MST->getMemOperand(),
                             false);
 }

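A hedged IR sketch (illustrative only, hypothetical name) of what the widening paths above and below act on: the <2 x float> gather result is narrower than any legal vector, so WidenVecRes_MGATHER widens the result, zero-fills the widened mask, and widens the index by reusing existing elements. test22 below shows the generated code for this shape:

; Sketch: <2 x float> is widened during type legalization; the extra
; mask lanes are filled with zeroes so no spurious loads occur.
declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)

define <2 x float> @widen_gather_result(<2 x float*> %ptrs, <2 x i1> %mask, <2 x float> %src0) {
  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %ptrs, i32 4, <2 x i1> %mask, <2 x float> %src0)
  ret <2 x float> %res
}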
+SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 1 && "Can widen only data operand of mscatter");
+  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
+  SDValue DataOp = MSC->getValue();
+  SDValue Mask = MSC->getMask();
+  MVT MaskVT = Mask.getSimpleValueType();
+
+  // Widen the value
+  SDValue WideVal = GetWidenedVector(DataOp);
+  unsigned NumElts = WideVal.getValueType().getVectorNumElements();
+  SDLoc dl(N);
+
+  // The mask should be widened as well
+  if (getTypeAction(MaskVT) == TargetLowering::TypePromoteInteger) {
+    Mask = GetPromotedInteger(Mask);
+    MaskVT = Mask.getSimpleValueType();
+  }
+
+  MVT WideMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+  Mask = ModifyToType(Mask, WideMaskVT, true);
+
+  // Widen index
+  SDValue Index = MSC->getIndex();
+  MVT WideIndexVT =
+      MVT::getVectorVT(Index.getSimpleValueType().getScalarType(), NumElts);
+  Index = ModifyToType(Index, WideIndexVT, false, true);
+
+  SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index};
+  return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
+                              MSC->getMemoryVT(), dl, Ops,
+                              MSC->getMemOperand());
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
   SDValue InOp0 = GetWidenedVector(N->getOperand(0));
   SDValue InOp1 = GetWidenedVector(N->getOperand(1));
@@ -3591,12 +3651,18 @@

 /// Modifies a vector input (widens or narrows) to a vector of NVT. The
 /// input vector must have the same element type as NVT.
-SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) {
+SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
+                                       bool FillWithZeroes,
+                                       bool UseExistingVal) {
   // Note that InOp might have been widened so it might already have
   // the right width or it might need be narrowed.
   EVT InVT = InOp.getValueType();
+  EVT EltVT = NVT.getVectorElementType();
   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
          "input and widen element type must match");
+  if (InOp.isUndef())
+    return DAG.getUNDEF(NVT);
+
   SDLoc dl(InOp);

   // Check if InOp already has the right width.
@@ -3605,13 +3671,19 @@
   unsigned InNumElts = InVT.getVectorNumElements();
   unsigned WidenNumElts = NVT.getVectorNumElements();
+  SDValue FillVal;
   if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
     unsigned NumConcat = WidenNumElts / InNumElts;
     SmallVector<SDValue, 16> Ops(NumConcat);
-    SDValue UndefVal = DAG.getUNDEF(InVT);
+    if (FillWithZeroes)
+      FillVal = DAG.getConstant(0, dl, InVT);
+    else if (UseExistingVal)
+      FillVal = InOp;
+    else
+      FillVal = DAG.getUNDEF(InVT);
     Ops[0] = InOp;
     for (unsigned i = 1; i != NumConcat; ++i)
-      Ops[i] = UndefVal;
+      Ops[i] = FillVal;

     return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops);
   }
@@ -3623,7 +3695,6 @@

   // Fall back to extract and build.
   SmallVector<SDValue, 16> Ops(WidenNumElts);
-  EVT EltVT = NVT.getVectorElementType();
   unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
   unsigned Idx;
   for (Idx = 0; Idx < MinNumElts; ++Idx)
     Ops[Idx] = DAG.getNode(
         ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
         DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
@@ -3631,8 +3702,13 @@
-  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  if (FillWithZeroes)
+    FillVal = DAG.getConstant(0, dl, EltVT);
+  else if (UseExistingVal)
+    FillVal = Ops[0];
+  else
+    FillVal = DAG.getUNDEF(EltVT);
   for ( ; Idx < WidenNumElts; ++Idx)
-    Ops[Idx] = UndefVal;
+    Ops[Idx] = FillVal;

   return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
 }
Index: ../lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- ../lib/Target/X86/X86ISelLowering.cpp
+++ ../lib/Target/X86/X86ISelLowering.cpp
@@ -1391,7 +1391,7 @@
       setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
       setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
     }
-    setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::i1,    Custom);
     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i1,  Custom);
@@ -1545,7 +1545,7 @@
       setOperationAction(ISD::OR,  VT, Legal);
       setOperationAction(ISD::XOR, VT, Legal);
     }
-    if (EltSize >= 32 && VT.getSizeInBits() <= 512) {
+    if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) {
       setOperationAction(ISD::MGATHER,  VT, Custom);
       setOperationAction(ISD::MSCATTER, VT, Custom);
     }
@@ -1571,6 +1571,8 @@
       setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
       setOperationAction(ISD::MLOAD,               VT, Legal);
       setOperationAction(ISD::MSTORE,              VT, Legal);
+      setOperationAction(ISD::MGATHER,             VT, Legal);
+      setOperationAction(ISD::MSCATTER,            VT, Custom);
     }
   }
   for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
@@ -19255,33 +19257,190 @@
   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
 }

+/// Modifies a vector input (widens or narrows) to a vector of NVT. The
+/// input vector must have the same element type as NVT.
+static SDValue ModifyToType(SDValue InOp, EVT NVT, SelectionDAG &DAG,
+                            bool FillWithZeroes = false,
+                            bool UseExistingVal = false) {
+  if (InOp.isUndef())
+    return DAG.getUNDEF(NVT);
+  // Check if InOp already has the right width.
+  EVT InVT = InOp.getValueType();
+  if (InVT == NVT)
+    return InOp;
+
+  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+         "input and widen element type must match");
+
+  unsigned InNumElts = InVT.getVectorNumElements();
+  unsigned WidenNumElts = NVT.getVectorNumElements();
+  EVT EltVT = NVT.getVectorElementType();
+
+  SDLoc dl(InOp);
+  SDValue FillVal;
+  if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
+    unsigned NumConcat = WidenNumElts / InNumElts;
+    if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) && NumConcat > 2) {
+      // Special case, because a CONCAT_VECTORS with many operands is not
+      // converted to a BUILD_VECTOR
+      SmallVector<SDValue, 16> Ops;
+      for (unsigned i = 0; i < InNumElts; ++i)
+        Ops.push_back(InOp.getOperand(i));
+      for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
+        Ops.push_back(
+            UseExistingVal ? InOp.getOperand(0) :
+            FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
+                             DAG.getUNDEF(EltVT));
+      return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+    }
+    if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
+      SDValue N0 = InOp.getOperand(0);
+      SDValue N1 = InOp.getOperand(1);
+      if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
+          N1.isUndef() || (N0 == N1 && UseExistingVal)) {
+        InOp = InOp.getOperand(0);
+        InVT = InOp.getValueType();
+        InNumElts = InVT.getVectorNumElements();
+        NumConcat = WidenNumElts / InNumElts;
+      }
+    }
+    SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
+    if (UseExistingVal && NumConcat > 2) {
+      SDValue ExtInOp = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT,
+                                    DAG.getUNDEF(NVT), InOp, ZeroIndex);
+      SmallVector<int, 16> ShuffleMask;
+      for (unsigned i = 0; i < NumConcat; ++i)
+        for (unsigned j = 0; j < InNumElts; ++j)
+          ShuffleMask.push_back(j);
+      return DAG.getVectorShuffle(NVT, dl, ExtInOp, DAG.getUNDEF(NVT),
+                                  ShuffleMask);
+    }
+    if (NumConcat > 2) {
+      FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
+                         InOp, ZeroIndex);
+    }
+    FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) :
+              UseExistingVal ? InOp : DAG.getUNDEF(InVT);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, InOp, FillVal);
+  }
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
+                       DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+  // Fall back to extract and build.
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
+  unsigned Idx;
+  for (Idx = 0; Idx < MinNumElts; ++Idx)
+    Ops[Idx] = DAG.getNode(
+        ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+        DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+  if (FillWithZeroes)
+    FillVal = DAG.getConstant(0, dl, EltVT);
+  else if (UseExistingVal)
+    FillVal = Ops[0];
+  else
+    FillVal = DAG.getUNDEF(EltVT);
+  for (; Idx < WidenNumElts; ++Idx)
+    Ops[Idx] = FillVal;
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+}
+
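For illustration only, a hedged IR sketch (hypothetical name) of a scatter that drives this X86-local ModifyToType helper: the <2 x float> data forces the lowering to widen the index vector by reusing existing elements (UseExistingVal) and the mask with zeroes (FillWithZeroes), so the widened lanes stay inactive. test20 below checks the resulting code:

; Sketch: the extra mask lanes are zero-filled, the extra index lanes
; reuse existing values (they are masked off and never dereferenced).
declare void @llvm.masked.scatter.v2f32(<2 x float>, <2 x float*>, i32, <2 x i1>)

define void @widen_scatter(<2 x float> %val, <2 x float*> %ptrs, <2 x i1> %mask) {
  call void @llvm.masked.scatter.v2f32(<2 x float> %val, <2 x float*> %ptrs, i32 4, <2 x i1> %mask)
  ret void
}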
 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
                              SelectionDAG &DAG) {
   assert(Subtarget->hasAVX512() &&
          "MGATHER/MSCATTER are supported on AVX-512 arch only");

+  // X86 scatter kills the mask register, so its type should be added to
+  // the list of return values.
+  // If the "scatter" already has 2 return values, it is already handled.
+  if (Op.getNode()->getNumValues() == 2)
+    return Op;
+
   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
-  EVT VT = N->getValue().getValueType();
+  SDValue Src = N->getValue();
+  MVT VT = Src.getSimpleValueType();
   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
   SDLoc dl(Op);

-  // X86 scatter kills mask register, so its type should be added to
-  // the list of return values
-  if (N->getNumValues() == 1) {
-    SDValue Index = N->getIndex();
-    if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
-        !Index.getValueType().is512BitVector())
-      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-
-    SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other);
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
-                      N->getOperand(3), Index };
-
-    SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand());
-    DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
-    return SDValue(NewScatter.getNode(), 0);
+  SDValue NewScatter;
+  SDValue Index = N->getIndex();
+  SDValue Mask = N->getMask();
+  SDValue Chain = N->getChain();
+  SDValue BasePtr = N->getBasePtr();
+  EVT MemVT = N->getMemoryVT();
+  MVT IndexVT = Index.getSimpleValueType();
+  MVT MaskVT = Mask.getSimpleValueType();
+
+  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
+    // Promoted data type
+    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
+           "Unexpected memory type");
+    int ShuffleMask[] = {0, 2, -1, -1};
+    Src = DAG.getVectorShuffle(MVT::v4i32, dl,
+                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Src),
+                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
+    // Now we have 4 elements instead of 2.
+    // Expand the index.
+    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
+    Index = ModifyToType(Index, NewIndexVT, DAG, false, true);
+
+    // Expand the mask with zeroes
+    // Mask may be <2 x i64> or <2 x i1> at this moment
+    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
+           "Unexpected mask type");
+    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
+    Mask = ModifyToType(Mask, ExtMaskVT, DAG, true);
+    VT = MVT::v4i32;
   }
-  return Op;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+      !Index.getValueType().is512BitVector()) {
+    // AVX512F supports only 512-bit vectors, so either the data or the
+    // index should be 512 bits wide.
+    // If both the index and the data are 256-bit here but the vector has
+    // 8 elements, we can just sign-extend the index.
+    if (IndexVT == MVT::v8i32)
+      // Just extend index
+      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+    else {
+      // The minimal number of elements in a scatter is 8
+      NumElts = 8;
+      // Index
+      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+      // Use the original index here, do not modify the index twice
+      Index = ModifyToType(N->getIndex(), NewIndexVT, DAG, false, true);
+      if (IndexVT.getScalarType() == MVT::i32)
+        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+      // Mask
+      // At this point we have a promoted mask operand
+      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+      // Use the original mask here, do not modify the mask twice
+      Mask = ModifyToType(N->getMask(), ExtMaskVT, DAG, true);
+
+      // The value that should be stored
+      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+      Src = ModifyToType(Src, NewVT, DAG);
+    }
+  }
+  // If the mask is "wide" at this point - truncate it to an i1 vector
+  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
+
+  // The mask is killed by scatter, add it to the values
+  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
+  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
+  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
+                                    N->getMemOperand());
+  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+  return SDValue(NewScatter.getNode(), 0);
 }
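A hedged IR sketch (illustrative only, hypothetical name) of the 8-element case the scatter lowering above and the gather lowering below both take: with plain AVX512F neither the 256-bit data nor the 256-bit index is 512 bits wide, so the i32 index is simply sign-extended to <8 x i64>. test7 below checks this for the gather side:

; Sketch: on AVX512F without VLX the <8 x i32> index is sign-extended
; to <8 x i64> so a 512-bit vpgatherqd can be used.
declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)

define <8 x i32> @extend_gather_index(i32* %base, <8 x i32> %ind, <8 x i1> %mask) {
  %gep = getelementptr i32, i32* %base, <8 x i32> %ind
  %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep, i32 4, <8 x i1> %mask, <8 x i32> undef)
  ret <8 x i32> %res
}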
 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
@@ -19290,17 +19449,59 @@
          "MGATHER/MSCATTER are supported on AVX-512 arch only");

   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
-  EVT VT = Op.getValueType();
-  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
   SDLoc dl(Op);
-
+  MVT VT = Op.getSimpleValueType();
   SDValue Index = N->getIndex();
+  SDValue Mask = N->getMask();
+  SDValue Src0 = N->getValue();
+  MVT IndexVT = Index.getSimpleValueType();
+  MVT MaskVT = Mask.getSimpleValueType();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
+
   if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
       !Index.getValueType().is512BitVector()) {
-    Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
-                      N->getOperand(3), Index };
-    DAG.UpdateNodeOperands(N, Ops);
+    // AVX512F supports only 512-bit vectors, so either the data or the
+    // index should be 512 bits wide. If both the index and the data are
+    // 256-bit here but the vector has 8 elements, we can just sign-extend
+    // the index.
+    if (NumElts == 8) {
+      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), Index };
+      DAG.UpdateNodeOperands(N, Ops);
+      return Op;
+    }
+
+    // Minimal number of elements in Gather
+    NumElts = 8;
+    // Index
+    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+    Index = ModifyToType(Index, NewIndexVT, DAG, false, true);
+    if (IndexVT.getScalarType() == MVT::i32)
+      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+    // Mask
+    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
+    // At this point we have a promoted mask operand
+    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+    Mask = ModifyToType(Mask, ExtMaskVT, DAG, true);
+    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
+
+    // The pass-thru value
+    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+    Src0 = ModifyToType(Src0, NewVT, DAG);
+
+    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+    SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
+                                            N->getMemoryVT(), dl, Ops,
+                                            N->getMemOperand());
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                  NewGather.getValue(0),
+                                  DAG.getIntPtrConstant(0, dl));
+    SDValue RetOps[] = {Extract, NewGather.getValue(1)};
+    return DAG.getMergeValues(RetOps, dl);
   }
   return Op;
 }
Index: ../test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- ../test/CodeGen/X86/masked_gather_scatter.ll
+++ ../test/CodeGen/X86/masked_gather_scatter.ll
@@ -1,19 +1,23 @@
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=KNL
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

-; KNL-LABEL: test1
-; KNL: kxnorw %k1, %k1, %k1
-; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
+; ALL-LABEL: test1:
+; ALL:       ## BB#0:
+; ALL-NEXT:    kxnorw %k1, %k1, %k1
+; ALL-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; ALL-NEXT:    vmovaps %zmm1, %zmm0
+; ALL-NEXT:    retq

   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
-
+
   %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
   ret <16 x float>%res
 }
@@ -21,11 +25,14 @@
 declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
 declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
 declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
-
-; KNL-LABEL: test2
-; KNL: kmovw %esi, %k1
-; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+
 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
+; ALL-LABEL: test2:
+; ALL:       ## BB#0:
+; ALL-NEXT:    kmovw %esi, %k1
+; ALL-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; ALL-NEXT:    vmovaps %zmm1, %zmm0
+; ALL-NEXT:    retq

   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -37,10 +44,13 @@
   ret <16 x float> %res
 }

-; KNL-LABEL: test3
-; KNL: kmovw %esi, %k1
-; KNL: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
+; ALL-LABEL: test3:
+; ALL:       ## BB#0:
+; ALL-NEXT:    kmovw %esi, %k1
+; ALL-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
+; ALL-NEXT:    vmovaps %zmm1, %zmm0
+; ALL-NEXT:    retq

   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
   %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
@@ -52,13 +62,16 @@
   ret <16 x i32> %res
 }

-; KNL-LABEL: test4
-; KNL: kmovw %esi, %k1
-; KNL: kmovw
-; KNL: vpgatherdd
-; KNL: vpgatherdd
-
 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
+; ALL-LABEL: test4:
+; ALL:       ## BB#0:
+; ALL-NEXT:    kmovw %esi, %k1
+; ALL-NEXT:    kmovw %k1, %k2
+; ALL-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
+; ALL-NEXT:    vmovaps %zmm1, %zmm2
+; ALL-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
+; ALL-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    retq

   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
   %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
@@ -71,12 +84,14 @@
   ret <16 x i32> %res
 }

-; KNL-LABEL: test5
-; KNL: kmovw %k1, %k2
-; KNL: vpscatterdd {{.*}}%k2
-; KNL: vpscatterdd {{.*}}%k1
-
 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
+; ALL-LABEL: test5:
+; ALL:       ## BB#0:
+; ALL-NEXT:    kmovw %esi, %k1
+; ALL-NEXT:    kmovw %k1, %k2
+; ALL-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
+; ALL-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; ALL-NEXT:    retq

   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
   %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
@@ -91,12 +106,15 @@
 declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
 declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )

-; KNL-LABEL: test6
-; KNL: kxnorw %k1, %k1, %k1
-; KNL: kxnorw %k2, %k2, %k2
-; KNL: vpgatherqd (,%zmm{{.*}}), %ymm{{.*}} {%k2}
-; KNL: vpscatterqd %ymm{{.*}}, (,%zmm{{.*}}) {%k1}
 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
+; ALL-LABEL: test6:
+; ALL:       ## BB#0:
+; ALL-NEXT:    kxnorw %k1, %k1, %k1
+; ALL-NEXT:    kxnorw %k2, %k2, %k2
+; ALL-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
+; ALL-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; ALL-NEXT:    vmovaps %zmm2, %zmm0
+; ALL-NEXT:    retq

   %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)

   call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <8 x i32>%a
 }

 ; In this case the index should be promoted to <8 x i64> for KNL
-; KNL-LABEL: test7
-; KNL: vpmovsxdq %ymm0, %zmm0
-; KNL: kmovw %k1, %k2
-; KNL: vpgatherqd {{.*}} {%k2}
-; KNL: vpgatherqd {{.*}} {%k1}
 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
+; KNL-LABEL: test7:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movzbl %sil, %eax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpmovsxdq %ymm0, %zmm0
+; KNL-NEXT:    kmovw %k1, %k2
+; KNL-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
+; KNL-NEXT:    vmovaps %zmm1, %zmm2
+; KNL-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test7:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovb %esi, %k1
+; SKX-NEXT:    kmovw %k1, %k2
+; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
+; SKX-NEXT:    vmovaps %zmm1, %zmm2
+; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
+; SKX-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
+; SKX-NEXT:    retq

   %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
   %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
@@ -125,15 +159,36 @@

 ; No uniform base in this case, index <8 x i64> contains addresses,
 ; each gather call will be split into two
-; KNL-LABEL: test8
-; KNL: kshiftrw $8, %k1, %k2
-; KNL: vpgatherqd
-; KNL: vpgatherqd
-; KNL: vinserti64x4
-; KNL: vpgatherqd
-; KNL: vpgatherqd
-; KNL: vinserti64x4
 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
+; KNL-LABEL: test8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftrw $8, %k1, %k2
+; KNL-NEXT:    kmovw %k2, %k3
+; KNL-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
+; KNL-NEXT:    kmovw %k1, %k3
+; KNL-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
+; KNL-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm4
+; KNL-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
+; KNL-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
+; KNL-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kshiftrw $8, %k1, %k2
+; SKX-NEXT:    kmovw %k2, %k3
+; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
+; SKX-NEXT:    kmovw %k1, %k3
+; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
+; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm4
+; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
+; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
+; SKX-NEXT:    retq

   %imask = bitcast i16 %mask to <16 x i1>
   %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
   %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
@@ -147,18 +202,42 @@

 ; Masked gather for aggregate types
 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
-; KNL-LABEL: test9
-; KNL: vpbroadcastq %rdi, %zmm
-; KNL: vpmovsxdq
-; KNL: vpbroadcastq
-; KNL: vpmuludq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpgatherqd (,%zmm
-
 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
+; KNL-LABEL: test9:
+; KNL:       ## BB#0: ## %entry
+; KNL-NEXT:    vpbroadcastq %rdi, %zmm2
+; KNL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
+; KNL-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; KNL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
+; KNL-NEXT:    vpsllq $32, %zmm1, %zmm1
+; KNL-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
+; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
+; KNL-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; KNL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
+; KNL-NEXT:    vpsllq $32, %zmm0, %zmm0
+; KNL-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
+; KNL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; KNL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; KNL-NEXT:    kxnorw %k1, %k1, %k1
+; KNL-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test9:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
+; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; SKX-NEXT:    kxnorw %k1, %k1, %k1
+; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX-NEXT:    retq
 entry:
   %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
   %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
@@ -168,17 +247,42 @@
   ret <8 x i32> %res
 }

-; KNL-LABEL: test10
-; KNL: vpbroadcastq %rdi, %zmm
-; KNL: vpmovsxdq
-; KNL: vpbroadcastq
-; KNL: vpmuludq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpgatherqd (,%zmm
 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
+; KNL-LABEL: test10:
+; KNL:       ## BB#0: ## %entry
+; KNL-NEXT:    vpbroadcastq %rdi, %zmm2
+; KNL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
+; KNL-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; KNL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
+; KNL-NEXT:    vpsllq $32, %zmm1, %zmm1
+; KNL-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
+; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
+; KNL-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; KNL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
+; KNL-NEXT:    vpsllq $32, %zmm0, %zmm0
+; KNL-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
+; KNL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; KNL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; KNL-NEXT:    kxnorw %k1, %k1, %k1
+; KNL-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test10:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
+; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; SKX-NEXT:    kxnorw %k1, %k1, %k1
+; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX-NEXT:    retq
 entry:
   %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
   %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
@@ -189,10 +293,13 @@
 }

 ; Splat index in GEP, requires broadcast
-; KNL-LABEL: test11
-; KNL: vpbroadcastd %esi, %zmm
-; KNL: vgatherdps (%rdi,%zmm
 define <16 x float> @test11(float* %base, i32 %ind) {
+; ALL-LABEL: test11:
+; ALL:       ## BB#0:
+; ALL-NEXT:    vpbroadcastd %esi, %zmm1
+; ALL-NEXT:    kxnorw %k1, %k1, %k1
+; ALL-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; ALL-NEXT:    retq

   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -204,10 +311,13 @@
 }

 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
-; KNL-LABEL: test12
-; KNL: kxnorw %k1, %k1, %k1
-; KNL: vgatherdps (%rdi,%zmm
 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
+; ALL-LABEL: test12:
+; ALL:       ## BB#0:
+; ALL-NEXT:    kxnorw %k1, %k1, %k1
+; ALL-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; ALL-NEXT:    vmovaps %zmm1, %zmm0
+; ALL-NEXT:    retq

   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
@@ -217,10 +327,12 @@
 }

 ; The same as the previous, but the mask is undefined
-; KNL-LABEL: test13
-; KNL-NOT: kxnorw
-; KNL: vgatherdps (%rdi,%zmm
 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
+; ALL-LABEL: test13:
+; ALL:       ## BB#0:
+; ALL-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; ALL-NEXT:    vmovaps %zmm1, %zmm0
+; ALL-NEXT:    retq

   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
@@ -230,10 +342,38 @@
 }

 ; The base pointer is not splat, can't find a uniform base
-; KNL-LABEL: test14
-; KNL: vgatherqps (,%zmm0)
-; KNL: vgatherqps (,%zmm0)
 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
+; KNL-LABEL: test14:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT:    vpbroadcastq %xmm0, %zmm0
+; KNL-NEXT:    vmovd %esi, %xmm1
+; KNL-NEXT:    vpbroadcastd %xmm1, %ymm1
+; KNL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; KNL-NEXT:    vpsllq $2, %zmm1, %zmm1
+; KNL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; KNL-NEXT:    kshiftrw $8, %k0, %k1
+; KNL-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
+; KNL-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
+; KNL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test14:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
+; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpbroadcastq %xmm0, %zmm0
+; SKX-NEXT:    vmovd %esi, %xmm1
+; SKX-NEXT:    vpbroadcastd %xmm1, %ymm1
+; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX-NEXT:    vpsllq $2, %zmm1, %zmm1
+; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    kshiftrw $8, %k0, %k1
+; SKX-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
+; SKX-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
+; SKX-NEXT:    vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
+; SKX-NEXT:    retq

   %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -244,4 +384,372 @@
   ret <16 x float>%res
 }

+declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
+
+; Mask that requires type legalization on KNL
+; Gather smaller than existing instruction
+define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
+; KNL-LABEL: test15:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxdq %ymm0, %zmm2
+; KNL-NEXT:    vpmovsxdq %ymm1, %zmm0
+; KNL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT:    vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test15:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpmovd2m %xmm1, %k1
+; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
+; SKX-NEXT:    retq
+
+  %sext_ind = sext <4 x i32> %ind to <4 x i64>
+  %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
+  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
+  ret <4 x float>%res
+}
+
+; Mask that requires type legalization on KNL
+; Gather smaller than existing instruction
+define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
+; KNL-LABEL: test16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; KNL-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; KNL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxdq %ymm0, %zmm0
+; KNL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; KNL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL-NEXT:    vmovaps %zmm2, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpmovd2m %xmm1, %k1
+; SKX-NEXT:    vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
+; SKX-NEXT:    vmovaps %zmm2, %zmm0
+; SKX-NEXT:    retq
+
+  %sext_ind = sext <4 x i32> %ind to <4 x i64>
+  %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
+  %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
+  ret <4 x double>%res
+}
+
+define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
+; KNL-LABEL: test17:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,0,1,0,1,0,1]
+; KNL-NEXT:    vpermq %zmm0, %zmm3, %zmm0
+; KNL-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL-NEXT:    vmovaps %zmm2, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test17:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpmovq2m %xmm1, %k1
+; SKX-NEXT:    vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT:    vmovaps %zmm2, %zmm0
+; SKX-NEXT:    retq
+
+  %sext_ind = sext <2 x i32> %ind to <2 x i64>
+  %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
+  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
+  ret <2 x double>%res
+}
+
+declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
+declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
+declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
+
+define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
+; KNL-LABEL: test18:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; KNL-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; KNL-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; KNL-NEXT:    vpmovsxdq %ymm2, %zmm2
+; KNL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; KNL-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test18:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpmovd2m %xmm2, %k1
+; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT:    retq
+  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+  ret void
+}
+
+define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
+; KNL-LABEL: test19:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; KNL-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; KNL-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; KNL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; KNL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test19:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpmovd2m %xmm1, %k1
+; SKX-NEXT:    vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
+; SKX-NEXT:    retq
+  %gep = getelementptr double, double* %ptr, <4 x i64> %ind
+  call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
+  ret void
+}
+
+; Data type requires widening
+define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1>%mask) {
+; KNL-LABEL: test20:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,0,1,0,1,0,1]
+; KNL-NEXT:    vpermq %zmm1, %zmm3, %zmm1
+; KNL-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; KNL-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
+; KNL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; KNL-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test20:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vinserti64x2 $1, %xmm1, %ymm1, %ymm1
+; SKX-NEXT:    vpmovq2m %xmm2, %k0
+; SKX-NEXT:    kshiftlw $2, %k0, %k0
+; SKX-NEXT:    kshiftrw $2, %k0, %k1
+; SKX-NEXT:    vscatterqps %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT:    retq
+  call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
+  ret void
+}
+
+; Data type requires promotion
+define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
+; KNL-LABEL: test21:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,0,1,0,1,0,1]
+; KNL-NEXT:    vpermq %zmm1, %zmm3, %zmm1
+; KNL-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; KNL-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
+; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; KNL-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test21:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vinserti64x2 $1, %xmm1, %ymm1, %ymm1
+; SKX-NEXT:    vpmovq2m %xmm2, %k0
+; SKX-NEXT:    kshiftlw $2, %k0, %k0
+; SKX-NEXT:    kshiftrw $2, %k0, %k1
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT:    retq
+  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
+  ret void
+}
+
+; The result type requires widening
+declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
+
+define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
+; KNL-LABEL: test22:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,0,1,0,1,0,1]
+; KNL-NEXT:    vpermq %zmm0, %zmm3, %zmm0
+; KNL-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL-NEXT:    vmovaps %zmm2, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test22:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vinserti64x2 $1, %xmm0, %ymm0, %ymm0
+; SKX-NEXT:    vpmovq2m %xmm1, %k0
+; SKX-NEXT:    kshiftlw $2, %k0, %k0
+; SKX-NEXT:    kshiftrw $2, %k0, %k1
+; SKX-NEXT:    vgatherqps (%rdi,%ymm0,4), %xmm2 {%k1}
+; SKX-NEXT:    vmovaps %zmm2, %zmm0
+; SKX-NEXT:    retq
+  %sext_ind = sext <2 x i32> %ind to <2 x i64>
+  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
+  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
+  ret <2 x float>%res
+}
+
+declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
+
+define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
+; KNL-LABEL: test23:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,0,1,0,1,0,1]
+; KNL-NEXT:    vpermq %zmm0, %zmm3, %zmm0
+; KNL-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL-NEXT:    vmovaps %zmm2, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test23:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpmovq2m %xmm1, %k1
+; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT:    vmovaps %zmm2, %zmm0
+; SKX-NEXT:    retq
+  %sext_ind = sext <2 x i32> %ind to <2 x i64>
+  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
+  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
+  ret <2 x i32>%res
+}
+
+define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
+; KNL-LABEL: test24:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1]
+; KNL-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; KNL-NEXT:    movb $3, %al
+; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpgatherqq (%rdi,%zmm1,8), %zmm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test24:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kxnorw %k1, %k1, %k1
+; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
+; SKX-NEXT:    retq
+  %sext_ind = sext <2 x i32> %ind to <2 x i64>
+  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
+  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+  ret <2 x i32>%res
+}
+
+define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
+; KNL-LABEL: test25:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,0,1,0,1,0,1]
+; KNL-NEXT:    vpermq %zmm0, %zmm3, %zmm0
+; KNL-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL-NEXT:    vmovaps %zmm2, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test25:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpmovq2m %xmm1, %k1
+; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT:    vmovaps %zmm2, %zmm0
+; SKX-NEXT:    retq
+  %sext_ind = sext <2 x i32> %ind to <2 x i64>
+  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
+  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
+  ret <2 x i64>%res
+}
+
+; Legal on SKX; all-ones mask
+define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
+; KNL-LABEL: test26:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,0,1,0,1,0,1]
+; KNL-NEXT:    vpermq %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    movb $3, %al
+; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
+; KNL-NEXT:    vmovaps %zmm1, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test26:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kxnorw %k1, %k1, %k1
+; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
+; SKX-NEXT:    retq
+  %sext_ind = sext <2 x i32> %ind to <2 x i64>
+  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
+  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
+  ret <2 x i64>%res
+}
+
+; Result type requires widening; all-ones mask
+define <2 x float> @test27(float* %base, <2 x i32> %ind) {
+; KNL-LABEL: test27:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1]
+; KNL-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; KNL-NEXT:    movb $3, %al
+; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test27:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vinserti64x2 $1, %xmm0, %ymm0, %ymm1
+; SKX-NEXT:    movb $3, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
+; SKX-NEXT:    retq
+  %sext_ind = sext <2 x i32> %ind to <2 x i64>
+  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
+  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
+  ret <2 x float>%res
+}
+
+; Data type requires promotion, mask is all-ones
+define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
+; KNL-LABEL: test28:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,0,1,0,1,0,1]
+; KNL-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL-NEXT:    movb $3, %al
+; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test28:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vinserti64x2 $1, %xmm1, %ymm1, %ymm1
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT:    movb $3, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT:    retq
+  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
+  ret void
+}