Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -747,6 +747,7 @@
   SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue WidenVecOp_STORE(SDNode* N);
   SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
+  SDValue WidenVecOp_MGATHER(SDNode* N, unsigned OpNo);
  SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo);
   SDValue WidenVecOp_SETCC(SDNode* N);
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3353,6 +3353,7 @@
   case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::STORE:              Res = WidenVecOp_STORE(N); break;
   case ISD::MSTORE:             Res = WidenVecOp_MSTORE(N, OpNo); break;
+  case ISD::MGATHER:            Res = WidenVecOp_MGATHER(N, OpNo); break;
   case ISD::MSCATTER:           Res = WidenVecOp_MSCATTER(N, OpNo); break;
   case ISD::SETCC:              Res = WidenVecOp_SETCC(N); break;
   case ISD::FCOPYSIGN:          Res = WidenVecOp_FCOPYSIGN(N); break;
@@ -3604,36 +3605,85 @@
                             false, MST->isCompressingStore());
 }
 
-SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
-  assert(OpNo == 1 && "Can widen only data operand of mscatter");
-  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
-  SDValue DataOp = MSC->getValue();
-  SDValue Mask = MSC->getMask();
+SDValue DAGTypeLegalizer::WidenVecOp_MGATHER(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 4 && "Can widen only the index of mgather");
+  auto *MG = cast<MaskedGatherSDNode>(N);
+  SDValue DataOp = MG->getValue();
+  SDValue Mask = MG->getMask();
+  SDValue Scale = MG->getScale();
   EVT MaskVT = Mask.getValueType();
-  SDValue Scale = MSC->getScale();
+  EVT DataVT = DataOp.getValueType();
+
+  // Widen index.
+  SDValue Index = GetWidenedVector(MG->getIndex());
+  unsigned NumElts = Index.getValueType().getVectorNumElements();
 
   // Widen the value.
-  SDValue WideVal = GetWidenedVector(DataOp);
-  EVT WideVT = WideVal.getValueType();
-  unsigned NumElts = WideVT.getVectorNumElements();
-  SDLoc dl(N);
+  EVT WideDataVT = EVT::getVectorVT(*DAG.getContext(),
+                                    DataVT.getVectorElementType(),
+                                    NumElts);
+  DataOp = ModifyToType(DataOp, WideDataVT);
 
   // The mask should be widened as well.
   EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
                                     MaskVT.getVectorElementType(),
                                     NumElts);
   Mask = ModifyToType(Mask, WideMaskVT, true);
 
-  // Widen index.
+  SDLoc dl(N);
+  SDValue Ops[] = {MG->getChain(), DataOp, Mask, MG->getBasePtr(), Index,
+                   Scale};
+  SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideDataVT, MVT::Other),
+                                    MG->getMemoryVT(), dl, Ops,
+                                    MG->getMemOperand());
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+
+  Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, N->getValueType(0), Res,
+                    DAG.getConstant(0, dl,
+                                    TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+  ReplaceValueWith(SDValue(N, 0), Res.getValue(0));
+  return SDValue();
+}
+
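[Review note, illustrative only: WidenVecOp_MGATHER handles a gather whose result type is legal but whose index operand (operand 4) needs widening, e.g. a v2f64 gather addressed by a v2i32 index. A minimal IR sketch of the kind of input that can reach this path (the names and types here are assumptions, not part of the patch):

define <2 x double> @gather_narrow_index(double* %base, <2 x i32> %idx, <2 x i1> %m, <2 x double> %pt) {
  %ptrs = getelementptr inbounds double, double* %base, <2 x i32> %idx
  %g = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 8, <2 x i1> %m, <2 x double> %pt)
  ret <2 x double> %g
}
declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)

The index, pass-through and mask are all widened to four elements, the wide gather is emitted, and the original narrow result is recovered with EXTRACT_SUBVECTOR.]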
+SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
+  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
+  SDValue DataOp = MSC->getValue();
+  SDValue Mask = MSC->getMask();
   SDValue Index = MSC->getIndex();
-  EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
-                                     Index.getValueType().getScalarType(),
-                                     NumElts);
-  Index = ModifyToType(Index, WideIndexVT);
+  SDValue Scale = MSC->getScale();
+
+  unsigned NumElts;
+  if (OpNo == 1) {
+    DataOp = GetWidenedVector(DataOp);
+    NumElts = DataOp.getValueType().getVectorNumElements();
+
+    // Widen index.
+    EVT IndexVT = Index.getValueType();
+    EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
+                                       IndexVT.getVectorElementType(), NumElts);
+    Index = ModifyToType(Index, WideIndexVT);
+  } else if (OpNo == 4) {
+    Index = GetWidenedVector(Index);
+    NumElts = Index.getValueType().getVectorNumElements();
+
+    // Widen the data.
+    EVT DataVT = DataOp.getValueType();
+    EVT WideDataVT = EVT::getVectorVT(*DAG.getContext(),
+                                      DataVT.getVectorElementType(), NumElts);
+    DataOp = ModifyToType(DataOp, WideDataVT);
+  } else
+    llvm_unreachable("Can't widen this operand of mscatter");
 
-  SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index,
+  // The mask should be widened as well.
+  EVT MaskVT = Mask.getValueType();
+  EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
+                                    MaskVT.getVectorElementType(), NumElts);
+
+  Mask = ModifyToType(Mask, WideMaskVT, true);
+  SDValue Ops[] = {MSC->getChain(), DataOp, Mask, MSC->getBasePtr(), Index,
                    Scale};
   return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
-                              MSC->getMemoryVT(), dl, Ops,
+                              MSC->getMemoryVT(), SDLoc(N), Ops,
                               MSC->getMemOperand());
 }
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -721,6 +721,16 @@
     Known = std::move(KnownOut);
     break;
   }
+  case ISD::CONCAT_VECTORS:
+    Known.Zero.setAllBits(); Known.One.setAllBits();
+    for (unsigned i = 0; i != Op.getNumOperands(); ++i) {
+      if (SimplifyDemandedBits(Op.getOperand(i), NewMask, Known2, TLO, Depth+1))
+        return true;
+
+      Known.One &= Known2.One;
+      Known.Zero &= Known2.Zero;
+    }
+    break;
   case ISD::SELECT:
     if (SimplifyDemandedBits(Op.getOperand(2), NewMask, Known, TLO, Depth+1))
       return true;
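[Review note, illustrative only: the new CONCAT_VECTORS case reports the intersection of the known bits of all concatenated operands, so demanded-bits queries such as MaskedValueIsZero can look through the concats that v2i32 widening introduces; the PMADDWD combine later in this patch appears to rely on this. A hedged IR-level sketch of the property:

define <4 x i32> @concat_known_bits(<2 x i32> %a, <2 x i32> %b) {
  %a16 = and <2 x i32> %a, <i32 65535, i32 65535>
  %b16 = and <2 x i32> %b, <i32 65535, i32 65535>
  ; The concat of %a16 and %b16 has the top 16 bits of every lane known
  ; zero, because both operands do; an operand with no known bits (e.g.
  ; undef) would conservatively clear that knowledge.
  %c = shufflevector <2 x i32> %a16, <2 x i32> %b16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %c
}
]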
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -801,13 +801,6 @@
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 
-    // Provide custom widening for v2f32 setcc. This is really for VLX when
-    // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
-    // type legalization changing the result type to v4i1 during widening.
-    // It works fine for SSE2 and is probably faster so no need to qualify with
-    // VLX support.
-    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
-
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::CTPOP, VT, Custom);
@@ -838,6 +831,9 @@
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
     }
 
+    setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+    setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+
     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
@@ -1757,6 +1753,9 @@
 TargetLoweringBase::LegalizeTypeAction
 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (VT == MVT::v2i32 && Subtarget.hasSSE2())
+    return TypeWidenVector;
+
   if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
     return TypeSplitVector;
 
@@ -18206,11 +18205,6 @@
   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
          "Invalid number of packed elements for source and destination!");
 
-  // This is being called by type legalization because v2i32 is marked custom
-  // for result type legalization for v2f32.
-  if (VTOp0 == MVT::v2i32)
-    return SDValue();
-
   // The non-AVX512 code below works under the assumption that source and
   // destination types are the same.
   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
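[Review note, illustrative only: with v2i32 widened rather than promoted, plain v2i32 loads and stores are custom-lowered to a single 64-bit memory access instead of being extended to v2i64. A small sketch of the kind of IR that benefits (the expected instructions are an assumption based on the tests below; the exact asm depends on the subtarget):

define void @copy_v2i32(<2 x i32>* %src, <2 x i32>* %dst) {
  %v = load <2 x i32>, <2 x i32>* %src, align 8
  store <2 x i32> %v, <2 x i32>* %dst, align 8
  ret void
}

On 64-bit targets this should become an i64 (movq) load/store pair; on 32-bit targets an f64 (movsd) access is used instead, since i64 is not a legal scalar type there.]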
@@ -19141,23 +19135,43 @@
 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
-  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+  auto *St = cast<StoreSDNode>(Op.getNode());
   SDLoc dl(St);
   SDValue StoredVal = St->getValue();
 
   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
-  assert(StoredVal.getValueType().isVector() &&
-         StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
-         StoredVal.getValueType().getVectorNumElements() <= 8 &&
-         "Unexpected VT");
-  assert(!St->isTruncatingStore() && "Expected non-truncating store");
-  assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
-         "Expected AVX512F without AVX512DQI");
-
-  StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
-                          DAG.getUNDEF(MVT::v8i1), StoredVal,
+  if (StoredVal.getValueType().isVector() &&
+      StoredVal.getValueType().getVectorElementType() == MVT::i1) {
+    assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
+           "Unexpected VT");
+    assert(!St->isTruncatingStore() && "Expected non-truncating store");
+    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+           "Expected AVX512F without AVX512DQI");
+
+    StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                            DAG.getUNDEF(MVT::v8i1), StoredVal,
+                            DAG.getIntPtrConstant(0, dl));
+    StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
+
+    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags());
+  }
+
+  if (St->isTruncatingStore())
+    return SDValue();
+
+  assert(StoredVal.getValueType() == MVT::v2i32 && "Unexpected VT");
+
+  // Widen the vector to v4i32, bitcast it to a vector of two 64-bit elements,
+  // extract the single 64-bit element, and store it.
+  StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, StoredVal,
+                          DAG.getUNDEF(MVT::v2i32));
+  MVT StVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
+  MVT WideVT = MVT::getVectorVT(StVT, 2);
+  StoredVal = DAG.getBitcast(WideVT, StoredVal);
+  StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
                           DAG.getIntPtrConstant(0, dl));
-  StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
 
   return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                       St->getPointerInfo(), St->getAlignment(),
@@ -24717,10 +24731,24 @@
   MVT IndexVT = Index.getSimpleValueType();
   MVT MaskVT = Mask.getSimpleValueType();
 
-  // If the index is v2i32, we're being called by type legalization and we
-  // should just let the default handling take care of it.
-  if (IndexVT == MVT::v2i32)
+  // If the index is v2i32, we're being called by type legalization.
+  if (IndexVT == MVT::v2i32) {
+    // If the value type is only 128-bits we can avoid widening the whole
+    // scatter by using VPSCATTERDQ/VSCATTERDPD with 128-bit input.
+    if ((VT == MVT::v2f64 || VT == MVT::v2i64) &&
+        (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
+      // Widen the index.
+      Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Index,
+                          DAG.getUNDEF(IndexVT));
+      SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+      SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+      SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
+          VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
+      DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+      return SDValue(NewScatter.getNode(), 1);
+    }
     return SDValue();
+  }
 
   // If we don't have VLX and neither the passthru or index is 512-bits, we
   // need to widen until one is.
@@ -24856,8 +24884,22 @@
   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
 
   // If the index is v2i32, we're being called by type legalization.
-  if (IndexVT == MVT::v2i32)
+  if (IndexVT == MVT::v2i32) {
+    // If the result type is only 128-bits we can avoid widening the whole
+    // gather by using VPGATHERDQ/VGATHERDPD with 128-bit result.
+    if ((VT == MVT::v2f64 || VT == MVT::v2i64) &&
+        (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
+      // Widen the index.
+      Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Index,
+                          DAG.getUNDEF(IndexVT));
+      SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(),
+                        Index, N->getScale() };
+      return DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+          DAG.getVTList(VT, Mask.getValueType(), MVT::Other), Ops, dl,
+          N->getMemoryVT(), N->getMemOperand());
+    }
     return SDValue();
+  }
 
   // If we don't have VLX and neither the passthru or index is 512-bits, we
   // need to widen until one is.
@@ -25106,26 +25148,6 @@
     Results.push_back(Res);
     return;
   }
-  case ISD::SETCC: {
-    // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
-    // setCC result type is v2i1 because type legalzation will end up with
-    // a v4i1 setcc plus an extend.
-    assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
-    if (N->getOperand(0).getValueType() != MVT::v2f32)
-      return;
-    SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
-    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
-                              N->getOperand(0), UNDEF);
-    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
-                              N->getOperand(1), UNDEF);
-    SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
-                              N->getOperand(2));
-    if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
-      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
-                        DAG.getIntPtrConstant(0, dl));
-    Results.push_back(Res);
-    return;
-  }
   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
   case X86ISD::FMINC:
   case X86ISD::FMIN:
@@ -25172,22 +25194,16 @@
                         Src, DAG.getIntPtrConstant(0, dl));
     }
     SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
-    bool WidenType = getTypeAction(*DAG.getContext(),
-                                   MVT::v2i32) == TypeWidenVector;
-    ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
-    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
+    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Res,
                       DAG.getIntPtrConstant(0, dl));
     Results.push_back(Res);
     return;
   }
 
   if (Src.getValueType() == MVT::v2f32) {
-    SDValue Idx = DAG.getIntPtrConstant(0, dl);
     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                               DAG.getUNDEF(MVT::v2f32));
     Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
                       MVT::v4i32, Res);
-    if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
-      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
     Results.push_back(Res);
     return;
   }
@@ -25462,60 +25478,54 @@
       Results.push_back(Res.getValue(2));
       return;
     }
-    if (VT == MVT::v2i32) {
+    if (VT == MVT::v2i32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
       auto *Gather = cast<MaskedGatherSDNode>(N);
       SDValue Index = Gather->getIndex();
+      if (Index.getValueType() != MVT::v2i64)
+        return;
       SDValue Mask = Gather->getMask();
       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
       SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
                                  Gather->getValue(),
                                  DAG.getUNDEF(MVT::v2i32));
-      // If the index is v2i64 we can use it directly.
-      if (Index.getValueType() == MVT::v2i64 &&
-          (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
-        if (!Subtarget.hasVLX()) {
-          // We need to widen the mask, but the instruction will only use 2
-          // of its elements. So we can use undef.
-          Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
-                             DAG.getUNDEF(MVT::v2i1));
-          Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
-        }
-        SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
-                          Index, Gather->getScale() };
-        SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
-            DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
-            Gather->getMemoryVT(), Gather->getMemOperand());
-        SDValue Chain = Res.getValue(2);
-        if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
-          Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
-                            DAG.getIntPtrConstant(0, dl));
-        Results.push_back(Res);
-        Results.push_back(Chain);
-        return;
+      if (!Subtarget.hasVLX()) {
+        // We need to widen the mask, but the instruction will only use 2
+        // of its elements. So we can use undef.
+        Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+                           DAG.getUNDEF(MVT::v2i1));
+        Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
       }
-      EVT IndexVT = Index.getValueType();
-      EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
-                                        IndexVT.getScalarType(), 4);
-      // Otherwise we need to custom widen everything to avoid promotion.
-      Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
-                          DAG.getUNDEF(IndexVT));
-      Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
-                         DAG.getConstant(0, dl, MVT::v2i1));
       SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
                         Index, Gather->getScale() };
-      SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
-                                        Gather->getMemoryVT(), dl, Ops,
-                                        Gather->getMemOperand());
-      SDValue Chain = Res.getValue(1);
-      if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
-        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
-                          DAG.getIntPtrConstant(0, dl));
+      SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+          DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
+          Gather->getMemoryVT(), Gather->getMemOperand());
       Results.push_back(Res);
-      Results.push_back(Chain);
+      Results.push_back(Res.getValue(2));
       return;
     }
     break;
   }
+  case ISD::LOAD: {
+    // Custom widen v2i32 loads.
+    EVT VT = N->getValueType(0);
+    assert(VT == MVT::v2i32 && "Unexpected VT");
+    if (!ISD::isNON_EXTLoad(N))
+      return;
+    auto *Ld = cast<LoadSDNode>(N);
+    MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
+    SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
+                              Ld->getPointerInfo(),
+                              Ld->getAlignment(),
+                              Ld->getMemOperand()->getFlags());
+    SDValue Chain = Res.getValue(1);
+    MVT WideVT = MVT::getVectorVT(LdVT, 2);
+    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
+    Res = DAG.getBitcast(MVT::v4i32, Res);
+    Results.push_back(Res);
+    Results.push_back(Chain);
+    return;
+  }
   }
 }
@@ -33159,26 +33169,39 @@
     return SDValue();
 
   // Make sure the vXi16 type is legal. This covers the AVX512 without BWI
   // case.
-  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
+  // Allow v2i32 to be widened.
+  unsigned NumElems = std::max(4U, VT.getVectorNumElements());
+  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * NumElems);
   if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+
   APInt Mask17 = APInt::getHighBitsSet(32, 17);
   if (!DAG.MaskedValueIsZero(N1, Mask17) ||
       !DAG.MaskedValueIsZero(N0, Mask17))
     return SDValue();
 
+  SDLoc DL(N);
+  if (VT == MVT::v2i32) {
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+                     DAG.getUNDEF(MVT::v2i32));
+    N1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N1,
+                     DAG.getUNDEF(MVT::v2i32));
+  }
+
   // Use SplitOpsAndApply to handle AVX splitting.
   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                            ArrayRef<SDValue> Ops) {
     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
     return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
   };
-  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
-                          { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
-                          PMADDWDBuilder);
+  SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, VT,
+                                 { DAG.getBitcast(WVT, N0),
+                                   DAG.getBitcast(WVT, N1) }, PMADDWDBuilder);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, N->getValueType(0), Res,
+                     DAG.getIntPtrConstant(0, DL));
 }
 
 /// Optimize a single multiply with constant into two operations in order to
Index: test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
===================================================================
--- test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
+++ test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
@@ -8,7 +8,7 @@
 ; Verify the cost model for alternate shuffles.
 
 ; shufflevector instructions with illegal 64-bit vector types.
-; 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+; 64-bit packed integer vectors (v2i32) are widened to type v4i32.
 ; 64-bit packed float vectors (v2f32) are widened to type v4f32.
 
 define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) {
   %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x i32> %1
 }
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i32':
-; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
-; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
 ; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
 ; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
@@ -38,8 +38,8 @@
   ret <2 x i32> %1
 }
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i32_2':
-; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
-; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
 ; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
 ; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
Index: test/Analysis/CostModel/X86/arith.ll
===================================================================
--- test/Analysis/CostModel/X86/arith.ll
+++ test/Analysis/CostModel/X86/arith.ll
@@ -714,14 +714,14 @@
   ; A <2 x i32> gets expanded to a <2 x i64> vector.
   ; A <2 x i64> vector multiply is implemented using
   ; 3 PMULUDQ and 2 PADDS and 4 shifts.
-  ; SSSE3: cost of 8 {{.*}} %A0 = mul
-  ; SSE42: cost of 8 {{.*}} %A0 = mul
-  ; SLM: cost of 17 {{.*}} %A0 = mul
-  ; GLM: cost of 8 {{.*}} %A0 = mul
-  ; AVX: cost of 8 {{.*}} %A0 = mul
-  ; AVX2: cost of 8 {{.*}} %A0 = mul
-  ; AVX512F: cost of 8 {{.*}} %A0 = mul
-  ; AVX512BW: cost of 8 {{.*}} %A0 = mul
+  ; SSSE3: cost of 6 {{.*}} %A0 = mul
+  ; SSE42: cost of 2 {{.*}} %A0 = mul
+  ; SLM: cost of 11 {{.*}} %A0 = mul
+  ; GLM: cost of 2 {{.*}} %A0 = mul
+  ; AVX: cost of 2 {{.*}} %A0 = mul
+  ; AVX2: cost of 2 {{.*}} %A0 = mul
+  ; AVX512F: cost of 1 {{.*}} %A0 = mul
+  ; AVX512BW: cost of 1 {{.*}} %A0 = mul
   ; AVX512DQ: cost of 1 {{.*}} %A0 = mul
   %A0 = mul <2 x i32> undef, undef
Index: test/Analysis/CostModel/X86/fptoui.ll
===================================================================
--- test/Analysis/CostModel/X86/fptoui.ll
+++ test/Analysis/CostModel/X86/fptoui.ll
@@ -50,7 +50,7 @@
   ; SSE42: cost of 6 {{.*}} %V2I32 = fptoui
   ; AVX1: cost of 6 {{.*}} %V2I32 = fptoui
   ; AVX2: cost of 6 {{.*}} %V2I32 = fptoui
-  ; AVX512F: cost of 6 {{.*}} %V2I32 = fptoui
+  ; AVX512F: cost of 1 {{.*}} %V2I32 = fptoui
   ; AVX512DQ: cost of 1 {{.*}} %V2I32 = fptoui
   %V2I32 = fptoui <2 x double> undef to <2 x i32>
   ; SSE2: cost of 13 {{.*}} %V4I32 = fptoui
Index: test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
===================================================================
--- test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -45,7 +45,7 @@
 }
 
 ; AVX2-LABEL: test6
-; AVX2: Found an estimated cost of 6 {{.*}}.masked
+; AVX2: Found an estimated cost of 5 {{.*}}.masked
 define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
@@ -61,7 +61,7 @@
 }
 
 ; AVX2-LABEL: test8
-; AVX2: Found an estimated cost of 6 {{.*}}.masked
+; AVX2: Found an estimated cost of 5 {{.*}}.masked
 define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
Index: test/Analysis/CostModel/X86/sitofp.ll
===================================================================
--- test/Analysis/CostModel/X86/sitofp.ll
+++ test/Analysis/CostModel/X86/sitofp.ll
@@ -70,7 +70,7 @@
   ; AVX512: cost of 1 {{.*}} sitofp i32
   %cvt_i32_f64 = sitofp i32 undef to double
 
-  ; SSE2: cost of 20 {{.*}} sitofp <2 x i32>
+  ; SSE2: cost of 40 {{.*}} sitofp <2 x i32>
   ; AVX1: cost of 4 {{.*}} sitofp <2 x i32>
   ; AVX2: cost of 4 {{.*}} sitofp <2 x i32>
   ; AVX512: cost of 4 {{.*}} sitofp <2 x i32>
Index: test/Analysis/CostModel/X86/slm-arith-costs.ll
===================================================================
--- test/Analysis/CostModel/X86/slm-arith-costs.ll
+++ test/Analysis/CostModel/X86/slm-arith-costs.ll
@@ -195,7 +195,7 @@
 
 define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) {
 entry:
-; SLM: cost of 17 {{.*}} mul nsw <2 x i32>
+; SLM: cost of 11 {{.*}} mul nsw <2 x i32>
   %res = mul nsw <2 x i32> %a, %b
   ret <2 x i32> %res
 }
Index: test/Analysis/CostModel/X86/testshiftashr.ll
===================================================================
--- test/Analysis/CostModel/X86/testshiftashr.ll
+++ test/Analysis/CostModel/X86/testshiftashr.ll
@@ -65,7 +65,7 @@
 define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
 entry:
   ; SSE2: shift2i32
-  ; SSE2: cost of 12 {{.*}} ashr
+  ; SSE2: cost of 16 {{.*}} ashr
   ; SSE2-CODEGEN: shift2i32
   ; SSE2-CODEGEN: psrlq
 
@@ -320,7 +320,7 @@
 define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
 entry:
   ; SSE2: shift2i32c
-  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2: cost of 1 {{.*}} ashr
   ; SSE2-CODEGEN: shift2i32c
   ; SSE2-CODEGEN: psrad $3
 
Index: test/Analysis/CostModel/X86/testshiftlshr.ll
===================================================================
--- test/Analysis/CostModel/X86/testshiftlshr.ll
+++ test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -65,9 +65,9 @@
 define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
 entry:
   ; SSE2: shift2i32
-  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2: cost of 16 {{.*}} lshr
   ; SSE2-CODEGEN: shift2i32
-  ; SSE2-CODEGEN: psrlq
+  ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype2i32 %a , %b
   ret %shifttype2i32 %0
@@ -322,7 +322,7 @@
   ; SSE2: shift2i32c
   ; SSE2: cost of 1 {{.*}} lshr
   ; SSE2-CODEGEN: shift2i32c
-  ; SSE2-CODEGEN: psrlq $3
+  ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec2i32 %a , <i32 3, i32 3>
   ret %shifttypec2i32 %0
Index: test/Analysis/CostModel/X86/testshiftshl.ll
===================================================================
--- test/Analysis/CostModel/X86/testshiftshl.ll
+++ test/Analysis/CostModel/X86/testshiftshl.ll
@@ -65,9 +65,9 @@
 define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
 entry:
   ; SSE2: shift2i32
-  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2: cost of 10 {{.*}} shl
   ; SSE2-CODEGEN: shift2i32
-  ; SSE2-CODEGEN: psllq
+  ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype2i32 %a , %b
   ret %shifttype2i32 %0
@@ -322,7 +322,7 @@
   ; SSE2: shift2i32c
   ; SSE2: cost of 1 {{.*}} shl
   ; SSE2-CODEGEN: shift2i32c
-  ; SSE2-CODEGEN: psllq $3
+  ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec2i32 %a , <i32 3, i32 3>
   ret %shifttypec2i32 %0
Index: test/Analysis/CostModel/X86/uitofp.ll
=================================================================== --- test/Analysis/CostModel/X86/uitofp.ll +++ test/Analysis/CostModel/X86/uitofp.ll @@ -70,7 +70,7 @@ ; AVX512: cost of 1 {{.*}} uitofp i32 %cvt_i32_f64 = uitofp i32 undef to double - ; SSE2: cost of 20 {{.*}} uitofp <2 x i32> + ; SSE2: cost of 40 {{.*}} uitofp <2 x i32> ; AVX1: cost of 6 {{.*}} uitofp <2 x i32> ; AVX2: cost of 6 {{.*}} uitofp <2 x i32> ; AVX512: cost of 1 {{.*}} uitofp <2 x i32> Index: test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll =================================================================== --- test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll +++ test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll @@ -7,7 +7,6 @@ define <2 x double> @a(<2 x i32> %x) nounwind { ; CHECK-LABEL: a: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0 ; CHECK-NEXT: retl entry: @@ -19,7 +18,6 @@ ; CHECK-LABEL: b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttpd2dq %xmm0, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: retl entry: %y = fptosi <2 x double> %x to <2 x i32> Index: test/CodeGen/X86/2012-01-18-vbitcast.ll =================================================================== --- test/CodeGen/X86/2012-01-18-vbitcast.ll +++ test/CodeGen/X86/2012-01-18-vbitcast.ll @@ -4,17 +4,9 @@ define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: vcast: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $16, %rsp -; CHECK-NEXT: .seh_stackalloc 16 -; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; CHECK-NEXT: psubq %xmm1, %xmm0 -; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: movdqa (%rcx), %xmm0 +; CHECK-NEXT: psubd (%rdx), %xmm0 ; CHECK-NEXT: retq -; CHECK-NEXT: .seh_handlerdata -; CHECK-NEXT: .text -; CHECK-NEXT: .seh_endproc %af = bitcast <2 x float> %a to <2 x i32> %bf = bitcast <2 x float> %b to <2 x i32> %x = sub <2 x i32> %af, %bf Index: test/CodeGen/X86/2012-07-10-extload64.ll =================================================================== --- test/CodeGen/X86/2012-07-10-extload64.ll +++ test/CodeGen/X86/2012-07-10-extload64.ll @@ -34,7 +34,7 @@ ; CHECK-LABEL: load_64: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retl BB: %t = load <2 x i32>, <2 x i32>* %ptr Index: test/CodeGen/X86/3dnow-intrinsics.ll =================================================================== --- test/CodeGen/X86/3dnow-intrinsics.ll +++ test/CodeGen/X86/3dnow-intrinsics.ll @@ -52,8 +52,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm0 ; X64-NEXT: pf2id %mm0, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -169,8 +168,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm1 ; X64-NEXT: pfcmpeq %mm0, %mm1 ; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -209,8 +207,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm1 ; X64-NEXT: pfcmpge %mm0, %mm1 ; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -249,8 +246,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm1 ; X64-NEXT: pfcmpgt %mm0, %mm1 ; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -723,8 +719,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm0 ; X64-NEXT: pf2iw %mm0, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -896,12 +891,10 @@ ; ; X64-LABEL: test_pswapdsi: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: pswapd -{{[0-9]+}}(%rsp), %mm0 # mm0 = mem[1,0] +; X64-NEXT: movdq2q %xmm0, %mm0 +; X64-NEXT: pswapd %mm0, %mm0 # mm0 = mm0[1,0] ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x i32> %a to x86_mmx Index: test/CodeGen/X86/avx2-masked-gather.ll =================================================================== --- test/CodeGen/X86/avx2-masked-gather.ll +++ test/CodeGen/X86/avx2-masked-gather.ll @@ -9,23 +9,21 @@ define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { ; X86-LABEL: masked_gather_v2i32: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1 -; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2i32: ; X64: # %bb.0: # %entry ; X64-NEXT: vmovdqa (%rdi), %xmm2 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 -; X64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; NOGATHER-LABEL: masked_gather_v2i32: @@ -44,11 +42,11 @@ ; NOGATHER-NEXT: je .LBB0_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2 +; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 ; NOGATHER-NEXT: .LBB0_4: # %else2 -; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0 -; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0 +; NOGATHER-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; NOGATHER-NEXT: retq entry: %ld = load <2 x i32*>, <2 x i32*>* %ptr @@ -59,11 +57,10 @@ define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { ; X86-LABEL: masked_gather_v2i32_concat: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl @@ -71,7 +68,6 @@ ; X64-LABEL: masked_gather_v2i32_concat: ; X64: # %bb.0: # %entry ; X64-NEXT: vmovdqa (%rdi), %xmm2 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 @@ -94,12 +90,11 @@ ; NOGATHER-NEXT: je .LBB1_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2 +; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 ; NOGATHER-NEXT: .LBB1_4: # %else2 -; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0 -; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 -; NOGATHER-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0 +; NOGATHER-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; NOGATHER-NEXT: retq entry: %ld = load <2 x i32*>, <2 x i32*>* %ptr @@ -716,10 +711,10 @@ define <2 x i64> @masked_gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i64> %passthro) { ; X86-LABEL: masked_gather_v2i64: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovsxdq (%eax), %xmm2 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vpgatherdq %xmm0, (,%xmm2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl ; @@ -763,10 +758,10 @@ define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks, <2 x double> %passthro) { ; X86-LABEL: masked_gather_v2double: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovsxdq (%eax), %xmm2 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vgatherdpd %xmm0, (,%xmm2), %xmm1 ; X86-NEXT: vmovapd %xmm1, %xmm0 ; X86-NEXT: retl ; Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -1745,23 +1745,25 @@ ; NOVL: # %bb.0: ; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 ; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto2f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 ; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto2f64: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; VLNODQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; VLNODQ-NEXT: retq %cmpres = fcmp ogt <2 x double> %a, 
zeroinitializer @@ -2030,29 +2032,42 @@ } define <2 x float> @ubto2f32(<2 x i32> %a) { -; ALL-LABEL: ubto2f32: -; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: retq +; NOVL-LABEL: ubto2f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 } define <2 x double> @ubto2f64(<2 x i32> %a) { -; ALL-LABEL: ubto2f64: -; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; ALL-NEXT: retq +; NOVL-LABEL: ubto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f64: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 Index: test/CodeGen/X86/avx512-schedule.ll =================================================================== --- test/CodeGen/X86/avx512-schedule.ll +++ test/CodeGen/X86/avx512-schedule.ll @@ -2553,16 +2553,16 @@ ; GENERIC-LABEL: sbto2f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] -; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33] -; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <2 x double> %a, zeroinitializer @@ -2908,19 +2908,15 @@ ; GENERIC-LABEL: ubto2f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] -; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:0.50] +; 
GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f32: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [7:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> @@ -2931,20 +2927,16 @@ ; GENERIC-LABEL: ubto2f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] -; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:0.50] +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [5:1.00] ; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [7:0.50] ; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <2 x i32> %a, zeroinitializer Index: test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll =================================================================== --- test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll +++ test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll @@ -536,9 +536,7 @@ define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) { ; CHECK-LABEL: test_2xi32_to_16xi32_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -547,10 +545,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -562,10 +558,8 @@ define <16 x i32> 
@test_masked_z_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -576,10 +570,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -591,10 +583,8 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -605,10 +595,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -620,10 +608,8 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -634,10 +620,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd 
%zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -649,10 +633,8 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> Index: test/CodeGen/X86/avx512-trunc.ll =================================================================== --- test/CodeGen/X86/avx512-trunc.ll +++ test/CodeGen/X86/avx512-trunc.ll @@ -258,9 +258,20 @@ } define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { -; ALL-LABEL: trunc_qd_128: -; ALL: ## %bb.0: -; ALL-NEXT: retq +; KNL-LABEL: trunc_qd_128: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qd_128: +; SKX: ## %bb.0: +; SKX-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %x = trunc <2 x i64> %i to <2 x i32> ret <2 x i32> %x } @@ -268,8 +279,10 @@ define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 { ; KNL-LABEL: trunc_qd_128_mem: ; KNL: ## %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL-NEXT: vmovlps %xmm0, (%rdi) +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qd_128_mem: Index: test/CodeGen/X86/bitcast-and-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-and-setcc-128.ll +++ test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -491,130 +491,44 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; SSE2-SSSE3-LABEL: v2i32: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $32, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: psrad $31, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; 
SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0] -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: psrad $31, %xmm1 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movmskpd %xmm2, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 
-; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX12-LABEL: v2i32: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 +; AVX12-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512F-NEXT: vpsraq $32, %xmm3, %xmm3 -; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512F-NEXT: vpsraq $32, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsraq $32, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsraq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq Index: test/CodeGen/X86/bitcast-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-setcc-128.ll +++ test/CodeGen/X86/bitcast-setcc-128.ll @@ -333,79 +333,32 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; SSE2-SSSE3-LABEL: v2i32: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $32, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: psrad $31, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0] -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i32: -; AVX1: # %bb.0: -; 
AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX12-LABEL: v2i32: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq Index: test/CodeGen/X86/compress_expand.ll =================================================================== --- test/CodeGen/X86/compress_expand.ll +++ test/CodeGen/X86/compress_expand.ll @@ -254,18 +254,17 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger) { ; SKX-LABEL: test13: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k1 ; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1} ; SKX-NEXT: retq ; ; KNL-LABEL: test13: ; KNL: # %bb.0: +; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 +; KNL-NEXT: vptestnmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k1 ; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1} @@ -279,18 +278,17 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) { ; SKX-LABEL: test14: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k1 ; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1} ; SKX-NEXT: retq ; ; KNL-LABEL: test14: ; KNL: # %bb.0: +; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 +; KNL-NEXT: vptestnmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k1 ; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1} Index: test/CodeGen/X86/cvtv2f32.ll =================================================================== --- test/CodeGen/X86/cvtv2f32.ll +++ test/CodeGen/X86/cvtv2f32.ll @@ -41,11 +41,9 @@ define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) { ; X32-LABEL: uitofp_2i32_buildvector_cvt: ; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X32-NEXT: movapd {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15] -; X32-NEXT: orpd %xmm1, %xmm2 +; X32-NEXT: movdqa {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15] +; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X32-NEXT: por %xmm1, %xmm2 ; X32-NEXT: subpd %xmm1, %xmm2 ; X32-NEXT: cvtpd2ps %xmm2, %xmm1 ; X32-NEXT: mulps %xmm1, %xmm0 @@ -53,13 +51,13 @@ ; ; X64-LABEL: uitofp_2i32_buildvector_cvt: ; X64: # %bb.0: -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: movd %edi, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X64-NEXT: movdqa {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15] -; X64-NEXT: por %xmm1, %xmm2 -; X64-NEXT: subpd %xmm1, %xmm2 -; X64-NEXT: cvtpd2ps %xmm2, %xmm1 +; X64-NEXT: movd %edi, %xmm1 +; X64-NEXT: pinsrd $1, %esi, %xmm1 +; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503600e+15,4.503600e+15] +; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: subpd %xmm2, %xmm1 +; X64-NEXT: cvtpd2ps %xmm1, %xmm1 ; X64-NEXT: mulps %xmm1, %xmm0 ; X64-NEXT: retq %t1 = insertelement <2 x i32> undef, i32 %x, i32 0 @@ -72,23 +70,21 @@ define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) { ; X32-LABEL: uitofp_2i32_legalized: ; X32: # %bb.0: -; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; X32-NEXT: movaps {{.*#+}} xmm0 = [4.503600e+15,4.503600e+15] -; X32-NEXT: orps %xmm0, %xmm2 -; X32-NEXT: subpd %xmm0, %xmm2 -; X32-NEXT: cvtpd2ps %xmm2, %xmm0 +; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X32-NEXT: movdqa {{.*#+}} xmm2 = [4.503600e+15,4.503600e+15] +; X32-NEXT: por %xmm2, %xmm0 +; X32-NEXT: subpd %xmm2, %xmm0 +; X32-NEXT: cvtpd2ps %xmm0, %xmm0 ; X32-NEXT: mulps %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: uitofp_2i32_legalized: ; X64: # %bb.0: -; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; X64-NEXT: movaps {{.*#+}} xmm0 = [4.503600e+15,4.503600e+15] -; X64-NEXT: orps %xmm0, %xmm2 -; X64-NEXT: subpd %xmm0, %xmm2 -; X64-NEXT: cvtpd2ps %xmm2, %xmm0 +; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503600e+15,4.503600e+15] +; X64-NEXT: por %xmm2, %xmm0 +; X64-NEXT: subpd %xmm2, %xmm0 +; X64-NEXT: cvtpd2ps %xmm0, %xmm0 ; X64-NEXT: mulps %xmm1, %xmm0 ; X64-NEXT: 
retq %t1 = uitofp <2 x i32> %in to <2 x float> Index: test/CodeGen/X86/i64-to-float.ll =================================================================== --- test/CodeGen/X86/i64-to-float.ll +++ test/CodeGen/X86/i64-to-float.ll @@ -16,7 +16,7 @@ ; ; X32-AVX-LABEL: mask_sitofp_2i64_2f64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X32-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X32-AVX-NEXT: retl ; @@ -29,7 +29,7 @@ ; ; X64-AVX-LABEL: mask_sitofp_2i64_2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-AVX-NEXT: retq %and = and <2 x i64> %a, <i64 255, i64 65535> @@ -47,7 +47,7 @@ ; ; X32-AVX-LABEL: mask_uitofp_2i64_2f64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X32-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X32-AVX-NEXT: retl ; @@ -60,7 +60,7 @@ ; ; X64-AVX-LABEL: mask_uitofp_2i64_2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-AVX-NEXT: retq %and = and <2 x i64> %a, <i64 255, i64 65535> Index: test/CodeGen/X86/insertelement-shuffle.ll =================================================================== --- test/CodeGen/X86/insertelement-shuffle.ll +++ test/CodeGen/X86/insertelement-shuffle.ll @@ -46,18 +46,10 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind { ; X32_AVX256-LABEL: insert_subvector_512: ; X32_AVX256: # %bb.0: -; X32_AVX256-NEXT: pushl %ebp -; X32_AVX256-NEXT: movl %esp, %ebp -; X32_AVX256-NEXT: andl $-8, %esp -; X32_AVX256-NEXT: subl $8, %esp -; X32_AVX256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32_AVX256-NEXT: vmovlps %xmm2, (%esp) ; X32_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2 -; X32_AVX256-NEXT: vpinsrd $0, (%esp), %xmm2, %xmm2 +; X32_AVX256-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; X32_AVX256-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; X32_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; X32_AVX256-NEXT: movl %ebp, %esp -; X32_AVX256-NEXT: popl %ebp ; X32_AVX256-NEXT: retl ; ; X64_AVX256-LABEL: insert_subvector_512: Index: test/CodeGen/X86/known-signbits-vector.ll =================================================================== --- test/CodeGen/X86/known-signbits-vector.ll +++ test/CodeGen/X86/known-signbits-vector.ll @@ -10,8 +10,12 @@ ; ; X64-LABEL: signbits_sext_v2i64_sitofp_v2f64: ; X64: # %bb.0: -; X64-NEXT: vmovd %edi, %xmm0 -; X64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: movslq %esi, %rcx +; X64-NEXT: vmovq %rcx, %xmm0 +; X64-NEXT: vmovq %rax, %xmm1 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-NEXT: retq %1 = sext i32 %a0 to i64 @@ -243,8 +247,7 @@ ; X32-NEXT: vpsrad $16, %xmm0, %xmm1 ; X32-NEXT: vpsrlq $16, %xmm0, 
%xmm0 ; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; X32-NEXT: vpsrlq $16, %xmm0, %xmm0 -; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u] ; X32-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X32-NEXT: retl ; @@ -253,8 +256,7 @@ ; X64-NEXT: vpsrad $16, %xmm0, %xmm1 ; X64-NEXT: vpsrlq $16, %xmm0, %xmm0 ; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; X64-NEXT: vpsrlq $16, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u] ; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, Index: test/CodeGen/X86/lower-bitcast.ll =================================================================== --- test/CodeGen/X86/lower-bitcast.ll +++ test/CodeGen/X86/lower-bitcast.ll @@ -9,9 +9,7 @@ define double @test1(double %A) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq ; ; CHECK-WIDE-LABEL: test1: @@ -68,9 +66,7 @@ ; CHECK-LABEL: test4: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq ; Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -915,13 +915,12 @@ ; KNL_64-LABEL: test17: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovapd %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -929,36 +928,31 @@ ; KNL_32-LABEL: test17: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovapd %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test17: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovapd %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test17: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vgatherqpd 
(%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %xmm2 {%k1} ; SKX_32-NEXT: vmovapd %xmm2, %xmm0 ; SKX_32-NEXT: retl @@ -1080,8 +1074,8 @@ ; ; KNL_32-LABEL: test20: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 @@ -1099,7 +1093,6 @@ ; ; SKX_32-LABEL: test20: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1} @@ -1113,9 +1106,9 @@ ; KNL_64-LABEL: test21: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -1124,10 +1117,10 @@ ; ; KNL_32-LABEL: test21: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} @@ -1138,7 +1131,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} ; SKX-NEXT: retq ; @@ -1146,8 +1138,6 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) @@ -1161,7 +1151,7 @@ ; KNL_64-LABEL: test22: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 @@ -1174,7 +1164,7 @@ ; KNL_32-LABEL: test22: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 @@ -1187,7 +1177,6 @@ ; ; SKX-LABEL: test22: ; SKX: # %bb.0: -; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} @@ -1196,7 +1185,6 @@ ; ; SKX_32-LABEL: test22: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1264,28 +1252,28 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, 
<2 x i1> %mask, <2 x i32> %src0) { ; KNL_64-LABEL: test23: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1293,10 +1281,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test23: @@ -1304,10 +1290,8 @@ ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind @@ -1318,28 +1302,28 @@ define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) { ; KNL_64-LABEL: test23b: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1} +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23b: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1} +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1347,9 +1331,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test23b: @@ -1357,9 +1340,8 @@ ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) @@ -1369,22 +1351,22 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test24: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: movw $3, %ax ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test24: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: movw $3, %cx ; KNL_32-NEXT: kmovw %ecx, %k1 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1392,9 +1374,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovw %eax, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vmovdqa %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test24: @@ -1402,9 +1383,8 @@ ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: movb $3, %cl ; SKX_32-NEXT: kmovw %ecx, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vmovdqa %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind @@ -1416,13 +1396,12 @@ ; KNL_64-LABEL: test25: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: 
vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -1430,36 +1409,31 @@ ; KNL_32-LABEL: test25: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test25: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test25: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm2 {%k1} ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -1472,11 +1446,10 @@ ; KNL_64-LABEL: test26: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} +; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm1 {%k1} ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -1484,32 +1457,27 @@ ; KNL_32-LABEL: test26: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: movb $3, %cl ; KNL_32-NEXT: kmovw %ecx, %k1 -; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} +; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test26: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} +; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm1 {%k1} ; SKX-NEXT: vmovdqa %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test26: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 -; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1} +; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm1 {%k1} ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -1522,40 +1490,40 
@@ define <2 x float> @test27(float* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test27: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: movw $3, %ax ; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test27: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: movw $3, %cx ; KNL_32-NEXT: kmovw %ecx, %k1 -; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test27: ; SKX: # %bb.0: -; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovw %eax, %k1 -; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1} +; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test27: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: movb $3, %cl ; SKX_32-NEXT: kmovw %ecx, %k1 -; SKX_32-NEXT: vgatherdps (%eax,%xmm1,4), %xmm0 {%k1} +; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1} +; SKX_32-NEXT: vmovaps %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind @@ -1568,7 +1536,7 @@ ; KNL_64-LABEL: test28: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -1577,8 +1545,8 @@ ; ; KNL_32-LABEL: test28: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movw $3, %ax ; KNL_32-NEXT: kmovw %eax, %k1 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} @@ -1587,7 +1555,6 @@ ; ; SKX-LABEL: test28: ; SKX: # %bb.0: -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} ; SKX-NEXT: retq @@ -1596,8 +1563,6 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: movb $3, %al ; SKX_32-NEXT: kmovw %eax, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>) @@ -2666,28 +2631,26 @@ define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) { ; KNL_64-LABEL: test_scatter_2i32_index: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_64-NEXT: vpsllq $32, %xmm1, %xmm1 -; KNL_64-NEXT: vpsraq $32, %zmm1, %zmm1 ; KNL_64-NEXT: vpsllq $63, 
%xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1} +; KNL_64-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1} ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test_scatter_2i32_index: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1 -; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm1,8) {%k1} +; KNL_32-NEXT: vscatterdpd %zmm0, (%eax,%ymm1,8) {%k1} ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -2695,19 +2658,15 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vpsllq $32, %xmm1, %xmm1 -; SKX-NEXT: vpsraq $32, %xmm1, %xmm1 -; SKX-NEXT: vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1} +; SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1} ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_scatter_2i32_index: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1 -; SKX_32-NEXT: vpsllq $32, %xmm1, %xmm1 -; SKX_32-NEXT: vpsraq $32, %xmm1, %xmm1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vscatterqpd %xmm0, (%eax,%xmm1,8) {%k1} +; SKX_32-NEXT: vscatterdpd %xmm0, (%eax,%xmm1,8) {%k1} ; SKX_32-NEXT: retl %gep = getelementptr double, double *%base, <2 x i32> %ind call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask) Index: test/CodeGen/X86/masked_memop.ll =================================================================== --- test/CodeGen/X86/masked_memop.ll +++ test/CodeGen/X86/masked_memop.ll @@ -515,30 +515,20 @@ } define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { -; AVX1-LABEL: test14: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: test14: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: test14: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: test14: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1} @@ -547,9 +537,9 @@ ; ; SKX-LABEL: test14: ; SKX: ## %bb.0: -; 
SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 ; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -561,41 +551,38 @@ ; AVX1-LABEL: test15: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test15: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test15: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test15: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 +; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) @@ -603,32 +590,21 @@ } define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { -; AVX1-LABEL: test16: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test16: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: test16: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} 
xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: test16: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} @@ -638,9 +614,9 @@ ; ; SKX-LABEL: test16: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 ; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -652,48 +628,41 @@ ; AVX1-LABEL: test17: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test17: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test17: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} -; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test17: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} -; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 +; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 
x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) @@ -701,29 +670,19 @@ } define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) { -; AVX1-LABEL: test18: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test18: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: test18: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: test18: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} @@ -733,9 +692,9 @@ ; ; SKX-LABEL: test18: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer Index: test/CodeGen/X86/mmx-arith.ll =================================================================== --- test/CodeGen/X86/mmx-arith.ll +++ test/CodeGen/X86/mmx-arith.ll @@ -201,84 +201,56 @@ define void @test1(x86_mmx* %A, x86_mmx* %B) { ; X32-LABEL: test1: ; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X32-NEXT: paddq %xmm0, %xmm1 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: movdqa %xmm1, %xmm2 -; X32-NEXT: psrlq $32, %xmm2 +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: paddd %xmm0, %xmm1 +; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X32-NEXT: pmuludq %xmm0, %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X32-NEXT: pmuludq %xmm0, %xmm2 -; X32-NEXT: movdqa %xmm0, %xmm3 -; X32-NEXT: psrlq $32, %xmm3 -; X32-NEXT: pmuludq %xmm1, %xmm3 -; X32-NEXT: paddq %xmm2, %xmm3 -; X32-NEXT: psllq $32, %xmm3 -; X32-NEXT: pmuludq %xmm1, %xmm0 -; X32-NEXT: paddq %xmm3, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; X32-NEXT: movq %xmm1, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero 
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X32-NEXT: andps %xmm0, %xmm1 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: orps %xmm1, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; X32-NEXT: movq %xmm1, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X32-NEXT: xorps %xmm0, %xmm1 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pand %xmm1, %xmm0 +; X32-NEXT: movq %xmm0, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: por %xmm0, %xmm1 +; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pxor %xmm1, %xmm0 +; X32-NEXT: movq %xmm0, (%ecx) ; X32-NEXT: emms ; X32-NEXT: retl ; ; X64-LABEL: test1: ; X64: # %bb.0: # %entry ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-NEXT: paddq %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rdi) +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: movq %xmm1, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm2 -; X64-NEXT: pmuludq %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psrlq $32, %xmm3 -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: paddq %xmm2, %xmm3 -; X64-NEXT: psllq $32, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X64-NEXT: pmuludq %xmm0, %xmm1 -; X64-NEXT: paddq %xmm3, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rdi) +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: pmuludq %xmm2, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movq %xmm1, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-NEXT: pand %xmm1, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm1, (%rdi) +; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; X64-NEXT: por %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rdi) +; X64-NEXT: movq %xmm1, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-NEXT: pxor %xmm1, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: emms ; X64-NEXT: retq @@ -563,45 +535,39 @@ ; X32-LABEL: test3: ; X32: # %bb.0: # %entry ; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $16, %esp -; X32-NEXT: cmpl $0, 16(%ebp) +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: je .LBB3_1 ; X32-NEXT: # %bb.2: # %bb26.preheader +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 
{{[0-9]+}}(%esp), %edi ; X32-NEXT: xorl %ebx, %ebx ; X32-NEXT: xorl %eax, %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: .p2align 4, 0x90 ; X32-NEXT: .LBB3_3: # %bb26 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl 8(%ebp), %ecx -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl (%ecx,%ebx,8), %ecx -; X32-NEXT: movl 4(%esi,%ebx,8), %esi -; X32-NEXT: movl 12(%ebp), %edi -; X32-NEXT: addl (%edi,%ebx,8), %ecx -; X32-NEXT: adcl 4(%edi,%ebx,8), %esi -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: movl %ecx, (%esp) -; X32-NEXT: adcl %edx, %esi -; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: movd %xmm0, %eax -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X32-NEXT: movl (%edi,%ebx,8), %ebp +; X32-NEXT: movl 4(%edi,%ebx,8), %ecx +; X32-NEXT: addl (%esi,%ebx,8), %ebp +; X32-NEXT: adcl 4(%esi,%ebx,8), %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %edx, %ecx +; X32-NEXT: movd %ecx, %xmm0 +; X32-NEXT: movd %eax, %xmm1 +; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X32-NEXT: movd %xmm0, %edx ; X32-NEXT: incl %ebx -; X32-NEXT: cmpl 16(%ebp), %ebx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: jb .LBB3_3 ; X32-NEXT: jmp .LBB3_4 ; X32-NEXT: .LBB3_1: ; X32-NEXT: xorl %eax, %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: .LBB3_4: # %bb31 -; X32-NEXT: leal -12(%ebp), %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx Index: test/CodeGen/X86/mmx-cvt.ll =================================================================== --- test/CodeGen/X86/mmx-cvt.ll +++ test/CodeGen/X86/mmx-cvt.ll @@ -296,8 +296,8 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $32, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movq (%eax), %mm0 ; X86-NEXT: paddd %mm0, %mm0 Index: test/CodeGen/X86/mulvi32.ll =================================================================== --- test/CodeGen/X86/mulvi32.ll +++ test/CodeGen/X86/mulvi32.ll @@ -7,58 +7,39 @@ ; PR6399 define <2 x i32> @_mul2xi32a(<2 x i32>, <2 x i32>) { -; SSE-LABEL: _mul2xi32a: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: _mul2xi32a: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSE42-LABEL: _mul2xi32a: +; SSE42: # %bb.0: +; SSE42-NEXT: pmulld %xmm1, %xmm0 +; SSE42-NEXT: retq ; ; AVX-LABEL: _mul2xi32a: ; AVX: # %bb.0: -; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %r = mul <2 x i32> %0, %1 ret <2 x i32> %r } 
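[Illustrative note: the CHECK lines in these test diffs follow the auto-generated style of LLVM's update_llc_test_checks.py, so any hunk in this section can be reproduced by re-running that script on the test file in question. A minimal sketch, assuming an llvm source checkout and a freshly built llc on PATH (the script's default llc binary):

  $ python utils/update_llc_test_checks.py test/CodeGen/X86/mulvi32.ll

The _mul2xi32a hunk just above is representative of the whole section: once the <2 x i32> operands are legalized by widening to <4 x i32> rather than promotion to <2 x i64>, SSE4.2 selects a single pmulld for the 32-bit multiply, replacing the pmuludq/psllq/paddq sequence that previously emulated the multiply in 64-bit elements.]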
define <2 x i32> @_mul2xi32b(<2 x i32>, <2 x i32>) { -; SSE2-LABEL: _mul2xi32b: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSE2-NEXT: retq -; -; SSE42-LABEL: _mul2xi32b: -; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE42-NEXT: pmuludq %xmm0, %xmm1 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: retq +; SSE-LABEL: _mul2xi32b: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: _mul2xi32b: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq %factor0 = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> %factor1 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -88,42 +88,30 @@ define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind { ; SSE2-LABEL: v3i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, 8(%rdi) -; SSE2-NEXT: movq %xmm2, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movd %xmm2, 8(%rdi) +; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: v3i32: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi) -; SSE42-NEXT: movq %xmm1, (%rdi) +; SSE42-NEXT: extractps $1, %xmm0, 8(%rdi) +; SSE42-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE42-NEXT: movlps %xmm0, (%rdi) ; SSE42-NEXT: retq ; -; AVX1-LABEL: v3i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-NEXT: vextractps $2, %xmm0, 8(%rdi) -; AVX1-NEXT: vmovlps %xmm1, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v3i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vextractps $2, %xmm0, 8(%rdi) -; AVX2-NEXT: vmovlps %xmm1, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vextractps $1, %xmm0, 8(%rdi) +; AVX-NEXT: vmovlps %xmm1, (%rdi) +; AVX-NEXT: retq ; ; XOP-LABEL: v3i32: ; XOP: # %bb.0: -; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; XOP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; XOP-NEXT: vextractps $2, %xmm0, 8(%rdi) +; XOP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; XOP-NEXT: vextractps $1, %xmm0, 8(%rdi) ; XOP-NEXT: vmovlps %xmm1, (%rdi) ; XOP-NEXT: retq %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> Index: test/CodeGen/X86/pointer-vector.ll =================================================================== --- test/CodeGen/X86/pointer-vector.ll +++ 
test/CodeGen/X86/pointer-vector.ll @@ -117,7 +117,7 @@ ; CHECK-LABEL: BITCAST1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retl entry: %G = load <2 x i8*>, <2 x i8*>* %p Index: test/CodeGen/X86/ret-mmx.ll =================================================================== --- test/CodeGen/X86/ret-mmx.ll +++ test/CodeGen/X86/ret-mmx.ll @@ -33,7 +33,7 @@ ; CHECK-LABEL: t3: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: retq ret <2 x i32> <i32 1, i32 0> } Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -1075,36 +1075,84 @@ ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psadbw %xmm3, %xmm2 -; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: sad_2i8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB3_1: # %vector.body -; AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX-NEXT: addq $4, %rax -; AVX-NEXT: jne .LBB3_1 -; AVX-NEXT: # %bb.2: # %middle.block -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: sad_2i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB3_1: # %vector.body +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: addq $4, %rax +; AVX1-NEXT: jne .LBB3_1 +; AVX1-NEXT: # %bb.2: # %middle.block +; AVX1-NEXT: vphaddd %xmm1, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: sad_2i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB3_1: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: addq $4, %rax +; AVX2-NEXT: jne .LBB3_1 +; AVX2-NEXT: # %bb.2: # %middle.block +; AVX2-NEXT: vphaddd %xmm1, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sad_2i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: .p2align 4, 0x90 +; AVX512-NEXT: .LBB3_1: # %vector.body +; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX512-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: addq $4, %rax +; AVX512-NEXT: jne .LBB3_1 +; AVX512-NEXT: # %bb.2: # %middle.block +; AVX512-NEXT: vphaddd %xmm1, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: br label %vector.body Index: test/CodeGen/X86/scalar_widen_div.ll =================================================================== --- test/CodeGen/X86/scalar_widen_div.ll +++ test/CodeGen/X86/scalar_widen_div.ll @@ -13,20 +13,19 @@ ; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: pmovsxdq (%rdi,%rcx,8), %xmm0 -; CHECK-NEXT: pmovsxdq (%rsi,%rcx,8), %xmm1 -; CHECK-NEXT: pextrq $1, %xmm0, %rax -; CHECK-NEXT: pextrq $1, %xmm1, %rsi -; CHECK-NEXT: cqto -; CHECK-NEXT: idivq %rsi -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: movq %xmm1, %rsi -; CHECK-NEXT: cqto -; CHECK-NEXT: idivq %rsi -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: pextrd $1, %xmm0, %eax +; CHECK-NEXT: pextrd $1, %xmm1, %esi +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %esi +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: movd %xmm1, %edi +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %edi +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: pinsrd $1, %esi, %xmm0 ; CHECK-NEXT: movq %xmm0, (%r8,%rcx,8) ; CHECK-NEXT: retq entry: Index: test/CodeGen/X86/shrink_vmul.ll =================================================================== --- test/CodeGen/X86/shrink_vmul.ll +++ test/CodeGen/X86/shrink_vmul.ll @@ -26,32 +26,42 @@ ; X86-SSE-NEXT: movl c, %esi ; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx ; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax -; X86-SSE-NEXT: movd %eax, %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: movd %eax, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-SSE-NEXT: pmaddwd %xmm0, %xmm2 +; X86-SSE-NEXT: movq %xmm2, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movzbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movzbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movzbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movzbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8: @@ -59,23 +69,29 @@ ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm1 -; X64-SSE-NEXT: pxor %xmm2, %xmm2 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: movd %ecx, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; 
X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-SSE-NEXT: pmaddwd %xmm0, %xmm2 +; X64-SSE-NEXT: movq %xmm2, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi8: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movzbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movzbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 ; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -953,19 +969,28 @@ ; ; X86-AVX-LABEL: mul_2xi8_sext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: movsbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movsbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movsbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movsbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_sext: @@ -988,10 +1013,15 @@ ; X64-AVX-LABEL: mul_2xi8_sext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: movsbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movsbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1045,19 +1075,28 @@ ; ; X86-AVX-LABEL: mul_2xi8_sext_zext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxbq 
(%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movsbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movsbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movzbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movzbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_sext_zext: @@ -1081,10 +1120,15 @@ ; X64-AVX-LABEL: mul_2xi8_sext_zext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movsbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movzbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1132,19 +1176,28 @@ ; ; X86-AVX-LABEL: mul_2xi16_sext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: movswl 2(%edx,%ecx), %edi +; X86-AVX-NEXT: movswl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movswl 2(%eax,%ecx), %edx +; X86-AVX-NEXT: movswl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_sext: @@ -1162,10 +1215,15 @@ ; X64-AVX-LABEL: mul_2xi16_sext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: movswl 2(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movswl 2(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1204,43 +1262,42 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: psrlq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE-NEXT: paddq %xmm2, %xmm3 -; X86-SSE-NEXT: psllq $32, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: paddq %xmm3, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_sext_zext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: movswl 2(%edx,%ecx), %edi +; X86-AVX-NEXT: movswl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_sext_zext: @@ -1249,34 +1306,29 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pxor %xmm2, %xmm2 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-SSE-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE-NEXT: psrlq $32, %xmm3 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE-NEXT: paddq %xmm2, %xmm3 -; X64-SSE-NEXT: psllq $32, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: paddq %xmm3, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; 
X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_sext_zext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: movswl 2(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1477,20 +1529,26 @@ ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst1: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst1: @@ -1500,20 +1558,19 @@ ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: pxor %xmm1, %xmm1 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi8_varconst1: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: movl $255, %ecx -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: 
vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1552,13 +1609,19 @@ ; ; X86-AVX-LABEL: mul_2xi8_varconst2: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst2: @@ -1577,9 +1640,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst2: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1610,23 +1675,26 @@ ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X86-SSE-NEXT: pmullw %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst3: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst3: @@ -1636,23 +1704,19 @@ ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: pxor %xmm1, %xmm1 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X64-SSE-NEXT: pmullw %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi8_varconst3: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100 -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1693,13 +1757,19 @@ ; ; X86-AVX-LABEL: mul_2xi8_varconst4: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst4: @@ -1720,9 +1790,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst4: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1763,13 +1835,19 @@ ; ; X86-AVX-LABEL: mul_2xi8_varconst5: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst5: @@ -1790,9 +1868,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst5: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: 
vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1833,13 +1913,19 @@ ; ; X86-AVX-LABEL: mul_2xi8_varconst6: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst6: @@ -1860,9 +1946,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst6: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1905,9 +1993,7 @@ ; X86-AVX-NEXT: movl c, %edx ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1928,12 +2014,7 @@ ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; X64-AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1971,13 +2052,19 @@ ; ; X86-AVX-LABEL: mul_2xi16_varconst2: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movswl 2(%ecx,%eax), %esi +; X86-AVX-NEXT: movswl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_varconst2: @@ -1995,9 +2082,11 @@ ; X64-AVX-LABEL: mul_2xi16_varconst2: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movswl 
2(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -2027,15 +2116,14 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -2046,9 +2134,7 @@ ; X86-AVX-NEXT: movl c, %edx ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -2058,17 +2144,14 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pxor %xmm1, %xmm1 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-SSE-NEXT: movq %rcx, %xmm1 -; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; @@ -2077,12 +2160,7 @@ ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; X64-AVX-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, 
(%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -2112,27 +2190,32 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst4: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movswl 2(%ecx,%eax), %esi +; X86-AVX-NEXT: movswl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_varconst4: @@ -2141,29 +2224,25 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 -; X64-SSE-NEXT: movq %rcx, %xmm1 -; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_varconst4: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 -; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000 -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movswl 2(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: Index: 
test/CodeGen/X86/shuffle-strided-with-offset-128.ll =================================================================== --- test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -144,29 +144,11 @@ ; AVX-NEXT: vmovlps %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v4i32_to_v2i32_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %L %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> store <2 x i32> %strided.vec, <2 x i32>* %S Index: test/CodeGen/X86/shuffle-vs-trunc-128.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -247,29 +247,11 @@ ; AVX-NEXT: vmovlps %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v4i32_to_v2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_to_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v4i32_to_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v4i32_to_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %L %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> store <2 x i32> %strided.vec, <2 x i32>* %S @@ -283,16 +265,36 @@ ; SSE-NEXT: movq %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: trunc_v2i64_to_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT: vmovlps %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_v2i64_to_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vmovlps %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovlps %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v2i64_to_v2i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vmovlps %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i32:
@@ -303,8 +305,10 @@
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512BW-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32:
Index: test/CodeGen/X86/sse-fsignum.ll
===================================================================
--- test/CodeGen/X86/sse-fsignum.ll
+++ test/CodeGen/X86/sse-fsignum.ll
@@ -33,19 +33,51 @@
}
define void @signum64a(<2 x double>*) {
-; AVX-LABEL: signum64a:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovapd (%rdi), %xmm0
-; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX-NEXT: vcvtdq2pd %xmm2, %xmm2
-; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: vsubpd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vmovapd %xmm0, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: signum64a:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vmovapd (%rdi), %xmm0
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-NEXT: vcvtdq2pd %xmm2, %xmm2
+; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX1-NEXT: vsubpd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: signum64a:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vmovapd (%rdi), %xmm0
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vcvtdq2pd %xmm2, %xmm2
+; AVX2-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX2-NEXT: vsubpd %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: signum64a:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vmovapd (%rdi), %xmm0
+; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
+; AVX512F-NEXT: vcvtdq2pd %xmm2, %xmm2
+; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX512F-NEXT: vsubpd %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vmovapd %xmm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
entry:
%1 = load <2 x double>, <2 x double>* %0
%2 = fcmp olt <2 x double> %1, zeroinitializer
Index: test/CodeGen/X86/trunc-ext-ld-st.ll
===================================================================
--- test/CodeGen/X86/trunc-ext-ld-st.ll
+++ test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -61,22 +61,12 @@
}
define void @load_2_i32(<2 x i32>* %A) {
-; SSE2-LABEL: load_2_i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: load_2_i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: retq
+; CHECK-LABEL: load_2_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: movq %xmm0, (%rdi)
+; CHECK-NEXT: retq
%T = load <2 x i32>, <2 x i32>* %A
%G = add <2 x i32> %T,
store <2 x i32> %G, <2 x i32>* %A
Index: test/CodeGen/X86/trunc-subvector.ll
===================================================================
--- test/CodeGen/X86/trunc-subvector.ll
+++ test/CodeGen/X86/trunc-subvector.ll
@@ -40,26 +40,14 @@
define <2 x i32> @test3(<8 x i32> %v) {
; SSE2-LABEL: test3:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; AVX2-LABEL: test3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test3:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: test3:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%x = sext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
%t = trunc <2 x i64> %s to <2 x i32>
@@ -69,24 +57,13 @@
define <2 x i32> @test4(<8 x i32> %v) {
; SSE2-LABEL: test4:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; AVX2-LABEL: test4:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test4:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: test4:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%x = sext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
%t = trunc <2 x i64> %s to <2 x i32>
@@ -96,14 +73,8 @@
define <2 x i32> @test5(<8 x i32> %v) {
; SSE2-LABEL: test5:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; AVX2-LABEL: test5:
@@ -112,7 +83,8 @@
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,2,4,6,4,6,6,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -120,9 +92,11 @@
; AVX512-LABEL: test5:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = sext <8 x i32> %v to <8 x i64>
@@ -167,25 +141,14 @@
define <2 x i32> @test8(<8 x i32> %v) {
; SSE2-LABEL: test8:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; AVX2-LABEL: test8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: test8:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%x = zext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
%t = trunc <2 x i64> %s to <2 x i32>
@@ -195,23 +158,13 @@
define <2 x i32> @test9(<8 x i32> %v) {
; SSE2-LABEL: test9:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; AVX2-LABEL: test9:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test9:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: test9:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%x = zext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
%t = trunc <2 x i64> %s to <2 x i32>
@@ -221,19 +174,14 @@
define <2 x i32> @test10(<8 x i32> %v) {
; SSE2-LABEL: test10:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorpd %xmm2, %xmm2
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <3,4,u,u,u,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -241,9 +189,11 @@
; AVX512-LABEL: test10:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = zext <8 x i32> %v to <8 x i64>
Index: test/CodeGen/X86/vec_cast3.ll
===================================================================
--- test/CodeGen/X86/vec_cast3.ll
+++ test/CodeGen/X86/vec_cast3.ll
@@ -5,9 +5,9 @@
define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) {
; CHECK-LABEL: cvt_v2i8_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsllq $56, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: vpslld $24, %xmm0, %xmm0
; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -23,9 +23,9 @@
define <2 x float> @cvt_v2i16_v2f32(<2 x i16> %src) {
; CHECK-LABEL: cvt_v2i16_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsllq $48, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: vpslld $16, %xmm0, %xmm0
; CHECK-NEXT: vpsrad $16, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -41,7 +41,6 @@
define <2 x float> @cvt_v2i32_v2f32(<2 x i32> %src) {
; CHECK-LABEL: cvt_v2i32_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -56,7 +55,7 @@
define <2 x float> @cvt_v2u8_v2f32(<2 x i8> %src) {
; CHECK-LABEL: cvt_v2u8_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -72,7 +71,9 @@
define <2 x float> @cvt_v2u16_v2f32(<2 x i16> %src) {
; CHECK-LABEL: cvt_v2u16_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,xmm0[10,11],zero,zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -88,10 +89,9 @@
define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) {
; CHECK-LABEL: cvt_v2u32_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15]
-; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15]
+; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0
; CHECK-NEXT: retl
@@ -172,7 +172,6 @@
; CHECK-LABEL: cvt_v2f32_v2i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2i32:
@@ -284,34 +283,22 @@
; CHECK: ## %bb.0:
; CHECK-NEXT: subl $68, %esp
; CHECK-NEXT: .cfi_def_cfa_offset 72
-; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: vcmpltss %xmm2, %xmm1, %xmm3
-; CHECK-NEXT: vsubss %xmm2, %xmm1, %xmm4
-; CHECK-NEXT: vblendvps %xmm3, %xmm1, %xmm4, %xmm3
-; CHECK-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
-; CHECK-NEXT: vcmpltss %xmm2, %xmm0, %xmm3
-; CHECK-NEXT: vsubss %xmm2, %xmm0, %xmm4
-; CHECK-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm3
-; CHECK-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
+; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; CHECK-NEXT: flds {{[0-9]+}}(%esp)
-; CHECK-NEXT: fisttpll (%esp)
+; CHECK-NEXT: fisttpll {{[0-9]+}}(%esp)
; CHECK-NEXT: flds {{[0-9]+}}(%esp)
; CHECK-NEXT: fisttpll {{[0-9]+}}(%esp)
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: vucomiss %xmm2, %xmm1
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: shll $31, %eax
-; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: vucomiss %xmm2, %xmm0
-; CHECK-NEXT: setae %cl
-; CHECK-NEXT: shll $31, %ecx
-; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fisttpll {{[0-9]+}}(%esp)
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fisttpll (%esp)
; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrd $3, (%esp), %xmm0, %xmm0
; CHECK-NEXT: addl $68, %esp
; CHECK-NEXT: retl
;
Index: test/CodeGen/X86/vec_ctbits.ll
===================================================================
--- test/CodeGen/X86/vec_ctbits.ll
+++ test/CodeGen/X86/vec_ctbits.ll
@@ -111,28 +111,32 @@
define <2 x i32> @promtz(<2 x i32> %a) nounwind {
; CHECK-LABEL: promtz:
; CHECK: # %bb.0:
-; CHECK-NEXT: por {{.*}}(%rip), %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm2, %xmm2
-; CHECK-NEXT: psubq %xmm0, %xmm2
+; CHECK-NEXT: psubd %xmm0, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
-; CHECK-NEXT: paddq %xmm2, %xmm3
+; CHECK-NEXT: paddd %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
-; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: psrld $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: psubq %xmm0, %xmm3
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubd %xmm0, %xmm3
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
; CHECK-NEXT: movdqa %xmm3, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: psrlq $2, %xmm3
+; CHECK-NEXT: psrld $2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
-; CHECK-NEXT: paddq %xmm2, %xmm3
+; CHECK-NEXT: paddd %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
-; CHECK-NEXT: psrlq $4, %xmm0
-; CHECK-NEXT: paddq %xmm3, %xmm0
+; CHECK-NEXT: psrld $4, %xmm0
+; CHECK-NEXT: paddd %xmm3, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT: psadbw %xmm1, %xmm2
+; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: psadbw %xmm1, %xmm0
+; CHECK-NEXT: packuswb %xmm2, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
ret <2 x i32> %c
@@ -141,44 +145,44 @@
define <2 x i32> @promlz(<2 x i32> %a) nounwind {
; CHECK-LABEL: promlz:
; CHECK: # %bb.0:
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psrlq $1, %xmm2
-; CHECK-NEXT: por %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $2, %xmm0
-; CHECK-NEXT: por %xmm2, %xmm0
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psrlq $4, %xmm2
-; CHECK-NEXT: por %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $8, %xmm0
-; CHECK-NEXT: por %xmm2, %xmm0
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psrlq $16, %xmm2
-; CHECK-NEXT: por %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $32, %xmm0
-; CHECK-NEXT: por %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $1, %xmm1
+; CHECK-NEXT: por %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrld $2, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $4, %xmm1
+; CHECK-NEXT: por %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrld $8, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $16, %xmm1
+; CHECK-NEXT: por %xmm0, %xmm1
; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
-; CHECK-NEXT: pxor %xmm0, %xmm2
+; CHECK-NEXT: pxor %xmm1, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: psrld $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: psubq %xmm0, %xmm2
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
-; CHECK-NEXT: movdqa %xmm2, %xmm3
-; CHECK-NEXT: pand %xmm0, %xmm3
-; CHECK-NEXT: psrlq $2, %xmm2
+; CHECK-NEXT: psubd %xmm0, %xmm2
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; CHECK-NEXT: movdqa %xmm2, %xmm1
+; CHECK-NEXT: pand %xmm0, %xmm1
+; CHECK-NEXT: psrld $2, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: paddq %xmm3, %xmm2
+; CHECK-NEXT: paddd %xmm1, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $4, %xmm0
-; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: psrld $4, %xmm0
+; CHECK-NEXT: paddd %xmm2, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT: psadbw %xmm1, %xmm2
+; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: psadbw %xmm1, %xmm0
-; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0
+; CHECK-NEXT: packuswb %xmm2, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
ret <2 x i32> %c
@@ -188,23 +192,27 @@
define <2 x i32> @prompop(<2 x i32> %a) nounwind {
; CHECK-LABEL: prompop:
; CHECK: # %bb.0:
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $1, %xmm1
+; CHECK-NEXT: psrld $1, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT: psubq %xmm1, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
-; CHECK-NEXT: movdqa %xmm0, %xmm3
-; CHECK-NEXT: pand %xmm1, %xmm3
-; CHECK-NEXT: psrlq $2, %xmm0
+; CHECK-NEXT: psubd %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: psrld $2, %xmm0
; CHECK-NEXT: pand %xmm1, %xmm0
-; CHECK-NEXT: paddq %xmm3, %xmm0
+; CHECK-NEXT: paddd %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $4, %xmm1
-; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: psrld $4, %xmm1
+; CHECK-NEXT: paddd %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT: psadbw %xmm2, %xmm1
+; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; CHECK-NEXT: psadbw %xmm0, %xmm2
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: psadbw %xmm0, %xmm1
+; CHECK-NEXT: packuswb %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
Index: test/CodeGen/X86/vec_extract-mmx.ll
===================================================================
--- test/CodeGen/X86/vec_extract-mmx.ll
+++ test/CodeGen/X86/vec_extract-mmx.ll
@@ -125,12 +125,10 @@
; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $8, %esp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: test4:
; X64: # %bb.0:
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
-; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: retq
%tmp0 = bitcast x86_mmx %a to <2 x i32>
%tmp1 = extractelement <2 x i32> %tmp0, i32 1
Index: test/CodeGen/X86/vec_fp_to_int.ll
===================================================================
--- test/CodeGen/X86/vec_fp_to_int.ll
+++ test/CodeGen/X86/vec_fp_to_int.ll
@@ -93,13 +93,11 @@
; SSE-LABEL: fptosi_2f64_to_2i32:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i32:
; AVX: # %bb.0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:
retq %cvt = fptosi <2 x double> %a to <2 x i32> ret <2 x i32> %cvt @@ -338,52 +336,23 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: cvttsd2si %xmm0, %rax ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax ; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f64_to_4i32: ; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttsd2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rdx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttsd2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; VEX-NEXT: vcvttsd2si %xmm1, %rax ; VEX-NEXT: vcvttsd2si %xmm0, %rcx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_4i32: @@ -419,50 +388,25 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: subsd %xmm1, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm1, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; 
VEX-LABEL: fptoui_2f64_to_2i32: ; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttsd2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rdx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttsd2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; VEX-NEXT: vcvttsd2si %xmm1, %rax ; VEX-NEXT: vcvttsd2si %xmm0, %rcx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; VEX-NEXT: vcvttsd2si %xmm0, %rax +; VEX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; VEX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_2i32: @@ -498,34 +442,17 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovbq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_4f64_to_2i32: @@ -752,46 +679,20 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm1, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm2 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subsd %xmm2, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd 
%xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subsd %xmm2, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_4f64_to_4i32: @@ -849,13 +750,11 @@ ; SSE-LABEL: fptosi_2f32_to_2i32: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f32_to_2i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i32> ret <2 x i32> %cvt @@ -1234,77 +1133,64 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: subss %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] +; SSE-NEXT: cvttss2si %xmm2, %rax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subss %xmm2, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rcx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f32_to_2i32: ; VEX: # %bb.0: -; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttss2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rdx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; 
VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttss2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; VEX-NEXT: vcvttss2si %xmm1, %rax ; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: vmovd %ecx, %xmm1 +; VEX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; VEX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; VEX-NEXT: vcvttss2si %xmm2, %rax +; VEX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f32_to_2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f32_to_2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_2f32_to_2i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VLDQ-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i32> ret <2 x i32> %cvt @@ -2242,7 +2128,8 @@ ; SSE-LABEL: fptosi_2f16_to_4i32: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rax -; SSE-NEXT: movss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: callq __gnu_f2h_ieee ; SSE-NEXT: movzwl %ax, %edi ; SSE-NEXT: callq __gnu_h2f_ieee @@ -2252,20 +2139,20 @@ ; SSE-NEXT: callq __gnu_f2h_ieee ; SSE-NEXT: movzwl %ax, %edi ; SSE-NEXT: callq __gnu_h2f_ieee -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: cvttss2si (%rsp), %rax # 4-byte Folded Reload -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f16_to_4i32: ; VEX: # %bb.0: ; VEX-NEXT: pushq %rax -; VEX-NEXT: vmovss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill +; VEX-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill +; VEX-NEXT: vmovaps %xmm1, %xmm0 ; VEX-NEXT: callq __gnu_f2h_ieee ; VEX-NEXT: movzwl %ax, %edi ; VEX-NEXT: callq __gnu_h2f_ieee @@ -2275,27 +2162,27 @@ ; VEX-NEXT: callq __gnu_f2h_ieee ; VEX-NEXT: movzwl %ax, %edi ; VEX-NEXT: callq __gnu_h2f_ieee -; VEX-NEXT: vcvttss2si %xmm0, %rax -; VEX-NEXT: 
vmovq %rax, %xmm0 -; VEX-NEXT: vcvttss2si (%rsp), %rax # 4-byte Folded Reload -; VEX-NEXT: vmovq %rax, %xmm1 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; VEX-NEXT: vcvttss2si %xmm0, %eax +; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vmovd %eax, %xmm1 +; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; VEX-NEXT: popq %rax ; VEX-NEXT: retq ; ; AVX512-LABEL: fptosi_2f16_to_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvttss2si %xmm1, %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vcvttss2si %xmm0, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvttss2si %xmm0, %eax +; AVX512-NEXT: vcvttss2si %xmm1, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512-NEXT: retq %cvt = fptosi <2 x half> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> @@ -2312,32 +2199,31 @@ ; SSE-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f80_to_4i32: ; AVX: # %bb.0: ; AVX-NEXT: fldt {{[0-9]+}}(%rsp) ; AVX-NEXT: fldt {{[0-9]+}}(%rsp) -; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp) -; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: retq %cvt = fptosi <2 x x86_fp80> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> @@ -2347,51 +2233,44 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { ; SSE-LABEL: 
fptosi_2f128_to_4i32: ; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movq %rsi, %r14 -; SSE-NEXT: movq %rdi, %rbx -; SSE-NEXT: movq %rdx, %rdi -; SSE-NEXT: movq %rcx, %rsi -; SSE-NEXT: callq __fixtfdi -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movq %rcx, %r14 +; SSE-NEXT: movq %rdx, %rbx +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movl %eax, %ebp ; SSE-NEXT: movq %rbx, %rdi ; SSE-NEXT: movq %r14, %rsi -; SSE-NEXT: callq __fixtfdi -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] -; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ebp, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f128_to_4i32: ; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp ; AVX-NEXT: pushq %r14 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: movq %rsi, %r14 -; AVX-NEXT: movq %rdi, %rbx -; AVX-NEXT: movq %rdx, %rdi -; AVX-NEXT: movq %rcx, %rsi -; AVX-NEXT: callq __fixtfdi -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: movq %rcx, %r14 +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: movl %eax, %ebp ; AVX-NEXT: movq %rbx, %rdi ; AVX-NEXT: movq %r14, %rsi -; AVX-NEXT: callq __fixtfdi -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vmovd %ebp, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %rbp ; AVX-NEXT: retq %cvt = fptosi <2 x fp128> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> Index: test/CodeGen/X86/vec_insert-5.ll =================================================================== --- test/CodeGen/X86/vec_insert-5.ll +++ test/CodeGen/X86/vec_insert-5.ll @@ -17,11 +17,9 @@ ; ; X64-LABEL: t1: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: shll $12, %edi -; X64-NEXT: movq %rdi, %xmm0 -; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq %tmp12 = shl i32 %a, 12 Index: test/CodeGen/X86/vec_insert-7.ll =================================================================== --- test/CodeGen/X86/vec_insert-7.ll +++ test/CodeGen/X86/vec_insert-7.ll @@ -8,18 +8,27 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind { ; X32-LABEL: mmx_movzl: ; X32: ## %bb.0: -; X32-NEXT: subl $20, %esp +; X32-NEXT: subl $44, %esp +; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp) +; X32-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: movl $32, %eax -; X32-NEXT: movd %eax, %xmm0 -; X32-NEXT: movq %xmm0, (%esp) +; X32-NEXT: pinsrd $0, %eax, %xmm0 +; X32-NEXT: pxor %xmm1, %xmm1 +; 
X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; X32-NEXT: movdqa %xmm1, (%esp) ; X32-NEXT: movq (%esp), %mm0 -; X32-NEXT: addl $20, %esp +; X32-NEXT: addl $44, %esp ; X32-NEXT: retl ; ; X64-LABEL: mmx_movzl: ; X64: ## %bb.0: +; X64-NEXT: movdq2q %xmm0, %mm0 +; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 ; X64-NEXT: movl $32, %eax -; X64-NEXT: movq %rax, %xmm0 +; X64-NEXT: pinsrd $0, %eax, %xmm1 +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; X64-NEXT: retq %tmp = bitcast x86_mmx %x to <2 x i32> %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 Index: test/CodeGen/X86/vec_insert-mmx.ll =================================================================== --- test/CodeGen/X86/vec_insert-mmx.ll +++ test/CodeGen/X86/vec_insert-mmx.ll @@ -13,10 +13,8 @@ ; ; X64-LABEL: t0: ; X64: ## %bb.0: -; X64-NEXT: ## kill: def $edi killed $edi def $rdi -; X64-NEXT: movq %rdi, %xmm0 -; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; X64-NEXT: retq %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1 %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -2652,8 +2652,10 @@ ; ; AVX-LABEL: sitofp_load_2i16_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: movswl 2(%rdi), %eax +; AVX-NEXT: movswl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i16>, <2 x i16> *%a @@ -2674,8 +2676,10 @@ ; ; AVX-LABEL: sitofp_load_2i8_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: movsbl 1(%rdi), %eax +; AVX-NEXT: movsbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i8>, <2 x i8> *%a @@ -3010,8 +3014,10 @@ ; ; AVX-LABEL: uitofp_load_2i8_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: movzbl 1(%rdi), %eax +; AVX-NEXT: movzbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i8>, <2 x i8> *%a Index: test/CodeGen/X86/vec_zero_cse.ll =================================================================== --- test/CodeGen/X86/vec_zero_cse.ll +++ test/CodeGen/X86/vec_zero_cse.ll @@ -22,7 +22,8 @@ ; X64-LABEL: test1: ; X64: # %bb.0: ; X64-NEXT: movq $0, {{.*}}(%rip) -; X64-NEXT: movq $0, {{.*}}(%rip) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movlps %xmm0, {{.*}}(%rip) ; X64-NEXT: retq store <1 x i64> zeroinitializer, <1 x i64>* @M1 store <2 x i32> zeroinitializer, <2 x i32>* @M2 @@ -41,8 +42,8 @@ ; X64-LABEL: test2: ; X64: # %bb.0: ; X64-NEXT: movq $-1, {{.*}}(%rip) -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movq %rax, {{.*}}(%rip) +; X64-NEXT: pcmpeqd %xmm0, %xmm0 +; X64-NEXT: movq %xmm0, {{.*}}(%rip) ; X64-NEXT: retq 
store <1 x i64> < i64 -1 >, <1 x i64>* @M1 store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2 Index: test/CodeGen/X86/vector-sext.ll =================================================================== --- test/CodeGen/X86/vector-sext.ll +++ test/CodeGen/X86/vector-sext.ll @@ -5051,8 +5051,7 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_2i8_to_2i32: @@ -5061,27 +5060,35 @@ ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,u,u,u,1,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSSE3-NEXT: paddq %xmm0, %xmm0 +; SSSE3-NEXT: paddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_2i8_to_2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 -; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: movsbl 1(%rdi), %eax +; SSE41-NEXT: movsbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: sext_2i8_to_2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movsbl 1(%rdi), %eax +; AVX-NEXT: movsbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X32-SSE41-LABEL: sext_2i8_to_2i32: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 -; X32-SSE41-NEXT: paddq %xmm0, %xmm0 +; X32-SSE41-NEXT: movsbl 1(%eax), %ecx +; X32-SSE41-NEXT: movsbl (%eax), %eax +; X32-SSE41-NEXT: movd %eax, %xmm0 +; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; X32-SSE41-NEXT: paddd %xmm0, %xmm0 ; X32-SSE41-NEXT: retl %x = load <2 x i8>, <2 x i8>* %addr, align 1 %y = sext <2 x i8> %x to <2 x i32> Index: test/CodeGen/X86/vector-trunc.ll =================================================================== --- test/CodeGen/X86/vector-trunc.ll +++ test/CodeGen/X86/vector-trunc.ll @@ -1563,15 +1563,73 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: retq ; -; AVX-LABEL: trunc2x2i64_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX-NEXT: retq +; AVX1-LABEL: trunc2x2i64_4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: retq ; -; AVX512-LABEL: trunc2x2i64_4i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512-NEXT: retq +; AVX2-SLOW-LABEL: trunc2x2i64_4i32: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2x2i64_4i32: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = 
[0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc2x2i64_4i32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x2i64_4i32: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x2i64_4i32: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x2i64_4i32: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq entry: %0 = trunc <2 x i64> %a to <2 x i32> %1 = trunc <2 x i64> %b to <2 x i32> @@ -1586,34 +1644,60 @@ ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; -; AVX-LABEL: trunc2i64_i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: trunc2i64_i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc2i64_i64: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovq %xmm0, %rax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2i64_i64: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovq %xmm0, %rax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc2i64_i64: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc2i64_i64: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; 
AVX512BW-LABEL: trunc2i64_i64: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc2i64_i64: ; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp) -; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq entry: %0 = trunc <2 x i64> %inval to <2 x i32> Index: test/CodeGen/X86/vector-zext.ll =================================================================== --- test/CodeGen/X86/vector-zext.ll +++ test/CodeGen/X86/vector-zext.ll @@ -2271,28 +2271,35 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_2i8_to_2i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movzwl (%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[3],zero,zero,zero -; SSSE3-NEXT: paddq %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: paddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_2i8_to_2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: movzbl 1(%rdi), %eax +; SSE41-NEXT: movzbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: zext_2i8_to_2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movzbl 1(%rdi), %eax +; AVX-NEXT: movzbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %x = load <2 x i8>, <2 x i8>* %addr, align 1 %y = zext <2 x i8> %x to <2 x i32> Index: test/CodeGen/X86/vshift-4.ll =================================================================== --- test/CodeGen/X86/vshift-4.ll +++ test/CodeGen/X86/vshift-4.ll @@ -58,7 +58,7 @@ ; X32-LABEL: shift2a: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -67,7 +67,7 @@ ; ; X64-LABEL: shift2a: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; 
X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 @@ -84,7 +84,7 @@ ; X32-LABEL: shift2b: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -93,7 +93,7 @@ ; ; X64-LABEL: shift2b: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 @@ -110,7 +110,7 @@ ; X32-LABEL: shift2c: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -119,7 +119,7 @@ ; ; X64-LABEL: shift2c: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 Index: test/CodeGen/X86/widen_arith-3.ll =================================================================== --- test/CodeGen/X86/widen_arith-3.ll +++ test/CodeGen/X86/widen_arith-3.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-8, %esp -; CHECK-NEXT: subl $40, %esp +; CHECK-NEXT: subl $24, %esp ; CHECK-NEXT: movl {{\.LCPI.*}}, %eax ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 Index: test/CodeGen/X86/widen_cast-5.ll =================================================================== --- test/CodeGen/X86/widen_cast-5.ll +++ test/CodeGen/X86/widen_cast-5.ll @@ -8,18 +8,15 @@ ; X86-LABEL: convert: ; X86: ## %bb.0: ## %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; X86-NEXT: pxor LCPI0_0, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: xorps LCPI0_0, %xmm0 +; X86-NEXT: movlps %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movq %rsi, %xmm0 -; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X64-NEXT: pxor {{.*}}(%rip), %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: test/CodeGen/X86/widen_conv-1.ll =================================================================== --- test/CodeGen/X86/widen_conv-1.ll +++ test/CodeGen/X86/widen_conv-1.ll @@ -8,16 +8,17 @@ ; X86-LABEL: convert_v2i64_to_v2i32: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert_v2i64_to_v2i32: ; X64: # %bb.0: # %entry -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: psubd %xmm1, %xmm0 ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: @@ -66,20 +67,14 @@ define void @convert_v5i16_to_v5i8(<5 
x i8>* %dst.addr, <5 x i16>* %src.addr) nounwind { ; X86-LABEL: convert_v5i16_to_v5i8: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movdqa (%ecx), %xmm0 ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: psubw %xmm1, %xmm0 ; X86-NEXT: pextrb $8, %xmm0, 4(%eax) ; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; X86-NEXT: movd %xmm0, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: convert_v5i16_to_v5i8: Index: test/CodeGen/X86/widen_conv-2.ll =================================================================== --- test/CodeGen/X86/widen_conv-2.ll +++ test/CodeGen/X86/widen_conv-2.ll @@ -7,18 +7,18 @@ define void @convert_v2i16_v2i32(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind { ; X86-LABEL: convert_v2i16_v2i32: ; X86: # %bb.0: # %entry +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: psllq $48, %xmm0 +; X86-NEXT: pslld $16, %xmm0 ; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert_v2i16_v2i32: ; X64: # %bb.0: # %entry -; X64-NEXT: psllq $48, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pslld $16, %xmm0 ; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: test/CodeGen/X86/widen_conv-3.ll =================================================================== --- test/CodeGen/X86/widen_conv-3.ll +++ test/CodeGen/X86/widen_conv-3.ll @@ -9,10 +9,10 @@ define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind { ; X86-SSE2-LABEL: convert_v2i16_to_v2f32: ; X86-SSE2: # %bb.0: # %entry +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: psllq $48, %xmm0 +; X86-SSE2-NEXT: pslld $16, %xmm0 ; X86-SSE2-NEXT: psrad $16, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE2-NEXT: movss %xmm0, (%eax) ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] @@ -21,10 +21,10 @@ ; ; X86-SSE42-LABEL: convert_v2i16_to_v2f32: ; X86-SSE42: # %bb.0: # %entry +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE42-NEXT: psllq $48, %xmm0 +; X86-SSE42-NEXT: pslld $16, %xmm0 ; X86-SSE42-NEXT: psrad $16, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax) ; X86-SSE42-NEXT: movss %xmm0, (%eax) @@ -32,9 +32,9 @@ ; ; X64-LABEL: convert_v2i16_to_v2f32: ; X64: # %bb.0: # %entry -; X64-NEXT: psllq $48, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pslld $16, %xmm0 ; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-NEXT: movlps %xmm0, (%rdi) ; X64-NEXT: retq Index: test/CodeGen/X86/widen_load-2.ll =================================================================== --- test/CodeGen/X86/widen_load-2.ll +++ test/CodeGen/X86/widen_load-2.ll @@ -151,7 +151,7 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl 
$16, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl 16(%ebp), %ecx ; X86-NEXT: movl 12(%ebp), %edx Index: test/CodeGen/X86/widened-broadcast.ll =================================================================== --- test/CodeGen/X86/widened-broadcast.ll +++ test/CodeGen/X86/widened-broadcast.ll @@ -597,22 +597,10 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_8i32_2i32_0101: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_8i32_2i32_0101: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_8i32_2i32_0101: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_8i32_2i32_0101: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -630,9 +618,7 @@ ; ; AVX1-LABEL: load_splat_16i32_2i32_0101: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 ; AVX1-NEXT: vmovaps %ymm0, %ymm1 ; AVX1-NEXT: retq ; @@ -644,9 +630,7 @@ ; ; AVX512-LABEL: load_splat_16i32_2i32_0101: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vbroadcastsd (%rdi), %zmm0 ; AVX512-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> Index: test/CodeGen/X86/x86-shifts.ll =================================================================== --- test/CodeGen/X86/x86-shifts.ll +++ test/CodeGen/X86/x86-shifts.ll @@ -254,16 +254,16 @@ ; X32-LABEL: shl2_other: ; X32: # %bb.0: # %entry ; X32-NEXT: movdqa %xmm0, %xmm1 -; X32-NEXT: psllq $2, %xmm1 -; X32-NEXT: psllq $9, %xmm0 +; X32-NEXT: pslld $2, %xmm1 +; X32-NEXT: pslld $9, %xmm0 ; X32-NEXT: pxor %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shl2_other: ; X64: # %bb.0: # %entry ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psllq $2, %xmm1 -; X64-NEXT: psllq $9, %xmm0 +; X64-NEXT: pslld $2, %xmm1 +; X64-NEXT: pslld $9, %xmm0 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: retq entry: @@ -276,19 +276,17 @@ define <2 x i32> @shr2_other(<2 x i32> %A) nounwind { ; X32-LABEL: shr2_other: ; X32: # %bb.0: # %entry -; X32-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-NEXT: movdqa %xmm0, %xmm1 -; X32-NEXT: psrlq $8, %xmm1 -; X32-NEXT: psrlq $1, %xmm0 +; X32-NEXT: psrld $8, %xmm1 +; X32-NEXT: psrld $1, %xmm0 ; X32-NEXT: pxor %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shr2_other: ; X64: # %bb.0: # %entry -; X64-NEXT: pand {{.*}}(%rip), %xmm0 ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psrlq $8, %xmm1 -; X64-NEXT: psrlq $1, %xmm0 +; X64-NEXT: psrld $8, %xmm1 +; X64-NEXT: psrld $1, %xmm0 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: retq entry: Index: test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -342,18 +342,30 @@ ; ZEROTHRESH-NEXT: [[B1:%.*]] = 
extractelement <4 x float> [[B]], i32 1 ; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 ; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 -; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0 -; ZEROTHRESH-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0 -; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] -; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] -; ZEROTHRESH-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]] -; ZEROTHRESH-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]] -; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0 -; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 -; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[S2]], i32 2 -; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 +; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 +; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer +; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer +; ZEROTHRESH-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 +; ZEROTHRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 +; ZEROTHRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] +; ZEROTHRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] +; ZEROTHRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 +; ZEROTHRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 +; ZEROTHRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 +; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 +; ZEROTHRESH-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 +; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 ; ZEROTHRESH-NEXT: ret <4 x float> [[RD]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -430,18 +442,12 @@ ; CHECK-NEXT: ret <2 x float> [[RB]] ; ; ZEROTHRESH-LABEL: @simple_select_v2( -; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> [[C]], i32 1 -; ZEROTHRESH-NEXT: [[A0:%.*]] 
= extractelement <2 x float> [[A:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> [[A]], i32 1 -; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> [[B:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> [[B]], i32 1 -; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 -; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] -; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] -; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[S0]], i32 0 -; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[S1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer +; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]] +; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0 +; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1 ; ZEROTHRESH-NEXT: ret <2 x float> [[RB]] ; %c0 = extractelement <2 x i32> %c, i32 0