Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -816,13 +816,6 @@ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - // Provide custom widening for v2f32 setcc. This is really for VLX when - // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to - // type legalization changing the result type to v4i1 during widening. - // It works fine for SSE2 and is probably faster so no need to qualify with - // VLX support. - setOperationAction(ISD::SETCC, MVT::v2i32, Custom); - for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); @@ -905,7 +898,9 @@ // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. setOperationAction(ISD::LOAD, MVT::v2f32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); @@ -1815,6 +1810,9 @@ TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT == MVT::v2i32 && Subtarget.hasSSE2()) + return TypeWidenVector; + if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; @@ -19001,11 +18999,6 @@ assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && "Invalid number of packed elements for source and destination!"); - // This is being called by type legalization because v2i32 is marked custom - // for result type legalization for v2f32. - if (VTOp0 == MVT::v2i32) - return SDValue(); - // The non-AVX512 code below works under the assumption that source and // destination types are the same. assert((Subtarget.hasAVX512() || (VT == VTOp0)) && @@ -19940,7 +19933,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - StoreSDNode *St = cast(Op.getNode()); + auto *St = cast(Op.getNode()); SDLoc dl(St); SDValue StoredVal = St->getValue(); @@ -19966,14 +19959,20 @@ if (St->isTruncatingStore()) return SDValue(); - assert(StoredVal.getValueType() == MVT::v2f32 && "Unexpected VT"); + MVT StoreVT = StoredVal.getSimpleValueType(); + assert((StoreVT == MVT::v2i32 || StoreVT == MVT::v2f32) && "Unexpected VT"); - // Widen the vector, cast to a v2x64 type, extract the single 64-bit + // Widen the vector, cast to a v2x64 type, extract the single element 64-bit // element and store it. - StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, StoredVal, - DAG.getUNDEF(MVT::v2f32)); - StoredVal = DAG.getBitcast(MVT::v2f64, StoredVal); - StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, StoredVal, + MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), + StoreVT.getVectorNumElements() * 2); + StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, + DAG.getUNDEF(StoreVT)); + MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? 
MVT::i64 + : MVT::f64; + MVT CastVT = MVT::getVectorVT(StVT, 2); + StoredVal = DAG.getBitcast(CastVT, StoredVal); + StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), @@ -26027,26 +26026,6 @@ Results.push_back(Res); return; } - case ISD::SETCC: { - // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when - // setCC result type is v2i1 because type legalzation will end up with - // a v4i1 setcc plus an extend. - assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type"); - if (N->getOperand(0).getValueType() != MVT::v2f32 || - getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector) - return; - SDValue UNDEF = DAG.getUNDEF(MVT::v2f32); - SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, - N->getOperand(0), UNDEF); - SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, - N->getOperand(1), UNDEF); - SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS, - N->getOperand(2)); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - return; - } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: @@ -26095,35 +26074,11 @@ assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - bool Widenv2i32 = - getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector; if (Src.getValueType() == MVT::v2f64) { + if (!IsSigned && !Subtarget.hasVLX()) + return; unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; - if (!IsSigned && !Subtarget.hasVLX()) { - // If v2i32 is widened, we can defer to the generic legalizer. - if (Widenv2i32) - return; - // Custom widen by doubling to a legal vector with. Isel will - // further widen to v8f64. - Opc = ISD::FP_TO_UINT; - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, - Src, DAG.getUNDEF(MVT::v2f64)); - } SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); - if (!Widenv2i32) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - return; - } - if (SrcVT == MVT::v2f32 && - getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) { - SDValue Idx = DAG.getIntPtrConstant(0, dl); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, - DAG.getUNDEF(MVT::v2f32)); - Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT - : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } @@ -26418,75 +26373,54 @@ Results.push_back(Res.getValue(2)); return; } - if (VT == MVT::v2i32) { + if (VT == MVT::v2i32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); + if (Index.getValueType() != MVT::v2i64) + return; SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Gather->getPassThru(), DAG.getUNDEF(MVT::v2i32)); - // If the index is v2i64 we can use it directly. - if (Index.getValueType() == MVT::v2i64 && - (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { - if (!Subtarget.hasVLX()) { - // We need to widen the mask, but the instruction will only use 2 - // of its elements. So we can use undef. 
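Note on the overall strategy: getPreferredVectorAction now returns TypeWidenVector for v2i32 whenever SSE2 is available, so the type legalizer widens <2 x i32> values to v4i32 instead of promoting them to v2i64, while the custom v2i32 LOAD/STORE lowering added in this patch keeps the actual memory access at 64 bits. A minimal sketch of the kind of IR this affects (the function and value names here are hypothetical, not taken from any test in the patch):

define <2 x i32> @add_2i32(<2 x i32>* %p, <2 x i32>* %q) {
  %a = load <2 x i32>, <2 x i32>* %p    ; expected to stay a single 8-byte load (movq/movsd style)
  %b = load <2 x i32>, <2 x i32>* %q
  %r = add <2 x i32> %a, %b             ; performed as an ordinary paddd on the widened v4i32
  ret <2 x i32> %r                      ; result lives in the low 64 bits of the widened register
}

Under the old promotion scheme the same function needed pmovzxdq/pshufd shuffles around the 64-bit memory accesses; under widening it should reduce to a 64-bit load, a full-width 32-bit add, and a 64-bit extract, which is what most of the test updates below reflect.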
- Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getUNDEF(MVT::v2i1)); - Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); - } - SDValue Ops[] = { Gather->getChain(), PassThru, Mask, - Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getTargetMemSDNode( - DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, - Gather->getMemoryVT(), Gather->getMemOperand()); - SDValue Chain = Res.getValue(2); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - Results.push_back(Chain); - return; + if (!Subtarget.hasVLX()) { + // We need to widen the mask, but the instruction will only use 2 + // of its elements. So we can use undef. + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getUNDEF(MVT::v2i1)); + Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } - EVT IndexVT = Index.getValueType(); - EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), - IndexVT.getScalarType(), 4); - // Otherwise we need to custom widen everything to avoid promotion. - Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, - DAG.getUNDEF(IndexVT)); - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getConstant(0, dl, MVT::v2i1)); SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), - Gather->getMemoryVT(), dl, Ops, - Gather->getMemOperand()); - SDValue Chain = Res.getValue(1); - if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); + SDValue Res = DAG.getTargetMemSDNode( + DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, + Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); - Results.push_back(Chain); + Results.push_back(Res.getValue(2)); return; } break; } case ISD::LOAD: { - // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids - // scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp cast - // since type legalization will try to use an i64 load. - assert(N->getValueType(0) == MVT::v2f32 && "Unexpected VT"); + // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This + // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp + // cast since type legalization will try to use an i64 load. + MVT VT = N->getSimpleValueType(0); + assert((VT == MVT::v2f32 || VT == MVT::v2i32) && "Unexpected VT"); if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast(N); - SDValue Res = DAG.getLoad(MVT::f64, dl, Ld->getChain(), Ld->getBasePtr(), + MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? 
MVT::i64 : MVT::f64; + SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Res); - Res = DAG.getBitcast(MVT::v4f32, Res); + MVT WideVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); + MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + Res = DAG.getBitcast(CastVT, Res); Results.push_back(Res); Results.push_back(Chain); return; Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -807,7 +807,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. - // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. + // 64-bit packed integer vectors (v2i32) are widened to type v4i32. std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); // For Broadcasts we are splatting the first element from the first input Index: test/Analysis/CostModel/X86/alternate-shuffle-cost.ll =================================================================== --- test/Analysis/CostModel/X86/alternate-shuffle-cost.ll +++ test/Analysis/CostModel/X86/alternate-shuffle-cost.ll @@ -18,9 +18,21 @@ ; 64-bit packed float vectors (v2f32) are widened to type v4f32. define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) { -; CHECK-LABEL: 'test_v2i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; SSE2-LABEL: 'test_v2i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; +; SSSE3-LABEL: 'test_v2i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; +; SSE42-LABEL: 'test_v2i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; +; AVX-LABEL: 'test_v2i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 ; ; BTVER2-LABEL: 'test_v2i32' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> @@ -56,9 +68,21 @@ } define <2 x i32> @test_v2i32_2(<2 x i32> %a, <2 x i32> %b) { -; CHECK-LABEL: 'test_v2i32_2' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; SSE2-LABEL: 'test_v2i32_2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; SSE2-NEXT: 
Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; +; SSSE3-LABEL: 'test_v2i32_2' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; +; SSE42-LABEL: 'test_v2i32_2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; +; AVX-LABEL: 'test_v2i32_2' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 ; ; BTVER2-LABEL: 'test_v2i32_2' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> Index: test/Analysis/CostModel/X86/arith.ll =================================================================== --- test/Analysis/CostModel/X86/arith.ll +++ test/Analysis/CostModel/X86/arith.ll @@ -1150,36 +1150,32 @@ ; A <2 x i64> vector multiply is implemented using ; 3 PMULUDQ and 2 PADDS and 4 shifts. define void @mul_2i32() { -; SSE-LABEL: 'mul_2i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSSE3-LABEL: 'mul_2i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A0 = mul <2 x i32> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'mul_2i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'mul_2i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; -; AVX512F-LABEL: 'mul_2i32' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; AVX512BW-LABEL: 'mul_2i32' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; AVX512DQ-LABEL: 'mul_2i32' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A0 = mul <2 x i32> undef, undef -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-LABEL: 'mul_2i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A0 = mul <2 x i32> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'mul_2i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %A0 = mul <2 x i32> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %A0 = mul <2 x i32> undef, undef ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; GLM-LABEL: 'mul_2i32' -; GLM-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; BTVER2-LABEL: 'mul_2i32' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %A0 = mul <2 x i32> undef, undef Index: test/Analysis/CostModel/X86/fptoui.ll =================================================================== --- test/Analysis/CostModel/X86/fptoui.ll +++ test/Analysis/CostModel/X86/fptoui.ll @@ -68,19 +68,12 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'fptoui_double_i32' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512DQ-LABEL: 'fptoui_double_i32' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'fptoui_double_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'fptoui_double_i32' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 Index: test/Analysis/CostModel/X86/masked-intrinsic-cost.ll =================================================================== --- test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -138,22 +138,22 @@ define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { ; AVX2-LABEL: 'test6' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SKL-LABEL: 'test6' ; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; KNL-LABEL: 'test6' ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SKX-LABEL: 'test6' ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -190,22 +190,22 @@ define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { ; AVX2-LABEL: 'test8' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; ; SKL-LABEL: 'test8' ; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; ; KNL-LABEL: 'test8' ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; 
KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; ; SKX-LABEL: 'test8' ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer Index: test/Analysis/CostModel/X86/sitofp.ll =================================================================== --- test/Analysis/CostModel/X86/sitofp.ll +++ test/Analysis/CostModel/X86/sitofp.ll @@ -85,7 +85,7 @@ define i32 @sitofp_i32_double() { ; SSE-LABEL: 'sitofp_i32_double' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef Index: test/Analysis/CostModel/X86/slm-arith-costs.ll =================================================================== --- test/Analysis/CostModel/X86/slm-arith-costs.ll +++ test/Analysis/CostModel/X86/slm-arith-costs.ll @@ -385,11 +385,11 @@ define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) { ; SLM-LABEL: 'slm-costs_32_v2_mul' -; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = mul nsw <2 x i32> %a, %b +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = mul nsw <2 x i32> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; ; GLM-LABEL: 'slm-costs_32_v2_mul' -; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = mul nsw <2 x i32> %a, %b +; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <2 x i32> %a, %b ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; entry: Index: test/Analysis/CostModel/X86/testshiftashr.ll =================================================================== --- test/Analysis/CostModel/X86/testshiftashr.ll +++ test/Analysis/CostModel/X86/testshiftashr.ll @@ -65,9 +65,9 @@ define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { entry: ; SSE2: shift2i32 - ; SSE2: cost of 12 {{.*}} ashr + ; SSE2: cost of 16 {{.*}} ashr ; SSE2-CODEGEN: shift2i32 - ; SSE2-CODEGEN: psrlq + ; SSE2-CODEGEN: 
psrad %0 = ashr %shifttype2i32 %a , %b ret %shifttype2i32 %0 @@ -320,7 +320,7 @@ define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) { entry: ; SSE2: shift2i32c - ; SSE2: cost of 4 {{.*}} ashr + ; SSE2: cost of 1 {{.*}} ashr ; SSE2-CODEGEN: shift2i32c ; SSE2-CODEGEN: psrad $3 Index: test/Analysis/CostModel/X86/testshiftlshr.ll =================================================================== --- test/Analysis/CostModel/X86/testshiftlshr.ll +++ test/Analysis/CostModel/X86/testshiftlshr.ll @@ -65,9 +65,9 @@ define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { entry: ; SSE2: shift2i32 - ; SSE2: cost of 4 {{.*}} lshr + ; SSE2: cost of 16 {{.*}} lshr ; SSE2-CODEGEN: shift2i32 - ; SSE2-CODEGEN: psrlq + ; SSE2-CODEGEN: psrld %0 = lshr %shifttype2i32 %a , %b ret %shifttype2i32 %0 @@ -322,7 +322,7 @@ ; SSE2: shift2i32c ; SSE2: cost of 1 {{.*}} lshr ; SSE2-CODEGEN: shift2i32c - ; SSE2-CODEGEN: psrlq $3 + ; SSE2-CODEGEN: psrld $3 %0 = lshr %shifttypec2i32 %a , ret %shifttypec2i32 %0 Index: test/Analysis/CostModel/X86/testshiftshl.ll =================================================================== --- test/Analysis/CostModel/X86/testshiftshl.ll +++ test/Analysis/CostModel/X86/testshiftshl.ll @@ -65,9 +65,9 @@ define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { entry: ; SSE2: shift2i32 - ; SSE2: cost of 4 {{.*}} shl + ; SSE2: cost of 10 {{.*}} shl ; SSE2-CODEGEN: shift2i32 - ; SSE2-CODEGEN: psllq + ; SSE2-CODEGEN: pmuludq %0 = shl %shifttype2i32 %a , %b ret %shifttype2i32 %0 @@ -322,7 +322,7 @@ ; SSE2: shift2i32c ; SSE2: cost of 1 {{.*}} shl ; SSE2-CODEGEN: shift2i32c - ; SSE2-CODEGEN: psllq $3 + ; SSE2-CODEGEN: pslld $3 %0 = shl %shifttypec2i32 %a , ret %shifttypec2i32 %0 Index: test/Analysis/CostModel/X86/uitofp.ll =================================================================== --- test/Analysis/CostModel/X86/uitofp.ll +++ test/Analysis/CostModel/X86/uitofp.ll @@ -85,7 +85,7 @@ define i32 @uitofp_i32_double() { ; SSE-LABEL: 'uitofp_i32_double' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef Index: test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll =================================================================== --- test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll +++ test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll @@ -7,7 +7,6 @@ define <2 x double> @a(<2 x i32> %x) nounwind { ; CHECK-LABEL: a: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0 ; CHECK-NEXT: retl entry: @@ -19,7 +18,6 @@ ; CHECK-LABEL: b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttpd2dq %xmm0, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: retl entry: %y = fptosi <2 x double> %x to <2 x i32> Index: test/CodeGen/X86/2012-01-18-vbitcast.ll =================================================================== --- 
test/CodeGen/X86/2012-01-18-vbitcast.ll +++ test/CodeGen/X86/2012-01-18-vbitcast.ll @@ -4,9 +4,8 @@ define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: vcast: ; CHECK: # %bb.0: -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: psubq %xmm1, %xmm0 +; CHECK-NEXT: movdqa (%rcx), %xmm0 +; CHECK-NEXT: psubd (%rdx), %xmm0 ; CHECK-NEXT: retq %af = bitcast <2 x float> %a to <2 x i32> %bf = bitcast <2 x float> %b to <2 x i32> Index: test/CodeGen/X86/2012-07-10-extload64.ll =================================================================== --- test/CodeGen/X86/2012-07-10-extload64.ll +++ test/CodeGen/X86/2012-07-10-extload64.ll @@ -34,7 +34,7 @@ ; CHECK-LABEL: load_64: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retl BB: %t = load <2 x i32>, <2 x i32>* %ptr Index: test/CodeGen/X86/3dnow-intrinsics.ll =================================================================== --- test/CodeGen/X86/3dnow-intrinsics.ll +++ test/CodeGen/X86/3dnow-intrinsics.ll @@ -52,8 +52,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm0 ; X64-NEXT: pf2id %mm0, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -169,8 +168,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm1 ; X64-NEXT: pfcmpeq %mm0, %mm1 ; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -209,8 +207,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm1 ; X64-NEXT: pfcmpge %mm0, %mm1 ; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -249,8 +246,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm1 ; X64-NEXT: pfcmpgt %mm0, %mm1 ; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -723,8 +719,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm0 ; X64-NEXT: pf2iw %mm0, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -896,12 +891,10 @@ ; ; X64-LABEL: test_pswapdsi: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: pswapd -{{[0-9]+}}(%rsp), %mm0 # mm0 = mem[1,0] +; X64-NEXT: movdq2q %xmm0, %mm0 +; X64-NEXT: pswapd %mm0, %mm0 # mm0 = mm0[1,0] ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x i32> %a to x86_mmx Index: test/CodeGen/X86/avx2-masked-gather.ll =================================================================== --- 
test/CodeGen/X86/avx2-masked-gather.ll +++ test/CodeGen/X86/avx2-masked-gather.ll @@ -9,23 +9,21 @@ define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { ; X86-LABEL: masked_gather_v2i32: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1 -; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2i32: ; X64: # %bb.0: # %entry ; X64-NEXT: vmovdqa (%rdi), %xmm2 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 -; X64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; NOGATHER-LABEL: masked_gather_v2i32: @@ -36,16 +34,14 @@ ; NOGATHER-NEXT: je .LBB0_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load ; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1 +; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm1, %xmm1 ; NOGATHER-NEXT: .LBB0_2: # %else ; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB0_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm1, %xmm1 +; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm1, %xmm1 ; NOGATHER-NEXT: .LBB0_4: # %else2 ; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0 ; NOGATHER-NEXT: retq @@ -58,11 +54,10 @@ define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { ; X86-LABEL: masked_gather_v2i32_concat: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl @@ -70,7 +65,6 @@ ; X64-LABEL: masked_gather_v2i32_concat: ; X64: # %bb.0: # %entry ; X64-NEXT: vmovdqa (%rdi), %xmm2 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 @@ -85,18 +79,16 @@ ; NOGATHER-NEXT: je .LBB1_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load ; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1 +; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm1, %xmm1 ; NOGATHER-NEXT: .LBB1_2: # %else ; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB1_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm1, %xmm1 +; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm1, %xmm1 ; NOGATHER-NEXT: .LBB1_4: # %else2 -; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0 ; NOGATHER-NEXT: retq 
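The two gathers above reduce to masked.gather calls of the following shape; this is a sketch with assumed names and alignment, not a copy of the test bodies (which first load the pointer vector from %ptr):

declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)

define <2 x i32> @gather_sketch(<2 x i32*> %ptrs, <2 x i1> %masks, <2 x i32> %passthro) {
  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %masks, <2 x i32> %passthro)
  ret <2 x i32> %res
}

With v2i32 widened, the passthru is concatenated with undef up to v4i32 and only the low two mask lanes matter, so the vpgatherdd/vpgatherqd result can be returned directly; that is why the trailing vpmovzxdq (the old zero extension back to the promoted v2i64 type) disappears from the checks, and why the NOGATHER fallback can use 32-bit vpinsrd loads instead of going through a general-purpose register.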
entry: %ld = load <2 x i32*>, <2 x i32*>* %ptr @@ -684,10 +676,10 @@ define <2 x i64> @masked_gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i64> %passthro) { ; X86-LABEL: masked_gather_v2i64: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovsxdq (%eax), %xmm2 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vpgatherdq %xmm0, (,%xmm2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl ; @@ -729,10 +721,10 @@ define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks, <2 x double> %passthro) { ; X86-LABEL: masked_gather_v2double: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovsxdq (%eax), %xmm2 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vgatherdpd %xmm0, (,%xmm2), %xmm1 ; X86-NEXT: vmovapd %xmm1, %xmm0 ; X86-NEXT: retl ; Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -2584,13 +2584,31 @@ } define <2 x double> @sbto2f64(<2 x double> %a) { -; ALL-LABEL: sbto2f64: -; ALL: # %bb.0: -; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; ALL-NEXT: retq +; NOVL-LABEL: sbto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VLDQ-LABEL: sbto2f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sbto2f64: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VLNODQ-NEXT: retq ; ; KNL_WIDEN-LABEL: sbto2f64: ; KNL_WIDEN: # %bb.0: @@ -2997,14 +3015,20 @@ } define <2 x float> @ubto2f32(<2 x i32> %a) { -; ALL-LABEL: ubto2f32: -; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: retq +; NOVL-LABEL: ubto2f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: retq ; ; KNL_WIDEN-LABEL: ubto2f32: ; KNL_WIDEN: # %bb.0: @@ -3019,15 +3043,22 @@ } define <2 x double> @ubto2f64(<2 x i32> %a) { -; ALL-LABEL: ubto2f64: -; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; ALL-NEXT: vpcmpeqq %xmm1, 
%xmm0, %xmm0 -; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; ALL-NEXT: retq +; NOVL-LABEL: ubto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f64: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VL-NEXT: retq ; ; KNL_WIDEN-LABEL: ubto2f64: ; KNL_WIDEN: # %bb.0: Index: test/CodeGen/X86/avx512-schedule.ll =================================================================== --- test/CodeGen/X86/avx512-schedule.ll +++ test/CodeGen/X86/avx512-schedule.ll @@ -2553,16 +2553,16 @@ ; GENERIC-LABEL: sbto2f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [0:0.25] -; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] -; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <2 x double> %a, zeroinitializer @@ -2908,19 +2908,15 @@ ; GENERIC-LABEL: ubto2f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [0:0.25] -; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] -; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:0.50] +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f32: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [7:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> @@ -2931,20 +2927,16 @@ ; GENERIC-LABEL: ubto2f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [0:0.25] -; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] -; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:0.50] +; GENERIC-NEXT: vpcmpeqd 
%xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [7:0.50] ; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [7:0.50] ; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <2 x i32> %a, zeroinitializer Index: test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll =================================================================== --- test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll +++ test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll @@ -536,9 +536,7 @@ define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) { ; CHECK-LABEL: test_2xi32_to_16xi32_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -547,10 +545,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -562,10 +558,8 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -576,10 +570,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -591,10 +583,8 @@ 
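All of the broadcast tests in this file follow one pattern: load a <2 x i32> from memory and splat its two lanes across a 512-bit vector. Roughly, and with the repeating 0,1 shuffle mask written out here as an assumption rather than copied from the file:

define <16 x i32> @broadcast_sketch(<2 x i32>* %vp) {
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i32> %res
}

Because the <2 x i32> load now legalizes to a plain 8-byte load rather than a vpmovzxdq, the whole pattern can match the from-memory vbroadcasti32x2/vbroadcastsd forms directly instead of materializing a [0,2,0,2,...] index vector for vpermd.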
define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -605,10 +595,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -620,10 +608,8 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -634,10 +620,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -649,10 +633,8 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> Index: test/CodeGen/X86/avx512-trunc.ll =================================================================== --- test/CodeGen/X86/avx512-trunc.ll +++ test/CodeGen/X86/avx512-trunc.ll @@ -258,9 +258,20 @@ } define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { -; ALL-LABEL: trunc_qd_128: -; ALL: ## %bb.0: -; ALL-NEXT: retq +; KNL-LABEL: trunc_qd_128: +; 
KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qd_128: +; SKX: ## %bb.0: +; SKX-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %x = trunc <2 x i64> %i to <2 x i32> ret <2 x i32> %x } @@ -268,8 +279,10 @@ define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 { ; KNL-LABEL: trunc_qd_128_mem: ; KNL: ## %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL-NEXT: vmovlps %xmm0, (%rdi) +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qd_128_mem: Index: test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -3194,7 +3194,10 @@ define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) { ; CHECK-LABEL: test_mm_cvtepi64_epi32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovqd %ymm0, %xmm0 +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} entry: %conv.i = trunc <2 x i64> %__A to <2 x i32> Index: test/CodeGen/X86/bitcast-and-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-and-setcc-128.ll +++ test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -491,122 +491,44 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; SSE2-SSSE3-LABEL: v2i32: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $32, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: psrad $31, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm0, 
%xmm3 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: psrad $31, %xmm1 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movmskpd %xmm2, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm4 -; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm4 -; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm3, %xmm4 -; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm4 -; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm3 -; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm3 -; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX12-LABEL: v2i32: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 +; AVX12-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512F-NEXT: vpsraq $32, %xmm3, %xmm3 -; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512F-NEXT: vpsraq $32, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; 
AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsraq $32, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsraq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq Index: test/CodeGen/X86/bitcast-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-setcc-128.ll +++ test/CodeGen/X86/bitcast-setcc-128.ll @@ -333,75 +333,32 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; SSE2-SSSE3-LABEL: v2i32: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $32, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: psrad $31, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX12-LABEL: v2i32: +; AVX12: # 
%bb.0: +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq Index: test/CodeGen/X86/compress_expand.ll =================================================================== --- test/CodeGen/X86/compress_expand.ll +++ test/CodeGen/X86/compress_expand.ll @@ -254,18 +254,17 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger) { ; SKX-LABEL: test13: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k1 ; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1} ; SKX-NEXT: retq ; ; KNL-LABEL: test13: ; KNL: # %bb.0: +; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 +; KNL-NEXT: vptestnmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k1 ; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1} @@ -279,18 +278,17 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) { ; SKX-LABEL: test14: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k1 ; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1} ; SKX-NEXT: retq ; ; KNL-LABEL: test14: ; KNL: # %bb.0: +; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 +; KNL-NEXT: vptestnmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k1 ; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1} Index: test/CodeGen/X86/cvtv2f32.ll =================================================================== --- test/CodeGen/X86/cvtv2f32.ll +++ test/CodeGen/X86/cvtv2f32.ll @@ -41,11 +41,9 @@ define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) { ; X32-LABEL: uitofp_2i32_buildvector_cvt: ; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X32-NEXT: movapd 
{{.*#+}} xmm1 = [4503599627370496,4503599627370496] -; X32-NEXT: orpd %xmm1, %xmm2 +; X32-NEXT: movdqa {{.*#+}} xmm1 = [4503599627370496,4503599627370496] +; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X32-NEXT: por %xmm1, %xmm2 ; X32-NEXT: subpd %xmm1, %xmm2 ; X32-NEXT: cvtpd2ps %xmm2, %xmm1 ; X32-NEXT: mulps %xmm1, %xmm0 @@ -53,13 +51,13 @@ ; ; X64-LABEL: uitofp_2i32_buildvector_cvt: ; X64: # %bb.0: -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: movd %edi, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X64-NEXT: movdqa {{.*#+}} xmm1 = [4503599627370496,4503599627370496] -; X64-NEXT: por %xmm1, %xmm2 -; X64-NEXT: subpd %xmm1, %xmm2 -; X64-NEXT: cvtpd2ps %xmm2, %xmm1 +; X64-NEXT: movd %edi, %xmm1 +; X64-NEXT: pinsrd $1, %esi, %xmm1 +; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X64-NEXT: movdqa {{.*#+}} xmm2 = [4503599627370496,4503599627370496] +; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: subpd %xmm2, %xmm1 +; X64-NEXT: cvtpd2ps %xmm1, %xmm1 ; X64-NEXT: mulps %xmm1, %xmm0 ; X64-NEXT: retq %t1 = insertelement <2 x i32> undef, i32 %x, i32 0 @@ -72,23 +70,21 @@ define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) { ; X32-LABEL: uitofp_2i32_legalized: ; X32: # %bb.0: -; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; X32-NEXT: movaps {{.*#+}} xmm0 = [4503599627370496,4503599627370496] -; X32-NEXT: orps %xmm0, %xmm2 -; X32-NEXT: subpd %xmm0, %xmm2 -; X32-NEXT: cvtpd2ps %xmm2, %xmm0 +; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X32-NEXT: movdqa {{.*#+}} xmm2 = [4503599627370496,4503599627370496] +; X32-NEXT: por %xmm2, %xmm0 +; X32-NEXT: subpd %xmm2, %xmm0 +; X32-NEXT: cvtpd2ps %xmm0, %xmm0 ; X32-NEXT: mulps %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: uitofp_2i32_legalized: ; X64: # %bb.0: -; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; X64-NEXT: movaps {{.*#+}} xmm0 = [4503599627370496,4503599627370496] -; X64-NEXT: orps %xmm0, %xmm2 -; X64-NEXT: subpd %xmm0, %xmm2 -; X64-NEXT: cvtpd2ps %xmm2, %xmm0 +; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-NEXT: movdqa {{.*#+}} xmm2 = [4503599627370496,4503599627370496] +; X64-NEXT: por %xmm2, %xmm0 +; X64-NEXT: subpd %xmm2, %xmm0 +; X64-NEXT: cvtpd2ps %xmm0, %xmm0 ; X64-NEXT: mulps %xmm1, %xmm0 ; X64-NEXT: retq %t1 = uitofp <2 x i32> %in to <2 x float> Index: test/CodeGen/X86/i64-to-float.ll =================================================================== --- test/CodeGen/X86/i64-to-float.ll +++ test/CodeGen/X86/i64-to-float.ll @@ -16,7 +16,7 @@ ; ; X32-AVX-LABEL: mask_sitofp_2i64_2f64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X32-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X32-AVX-NEXT: retl ; @@ -29,7 +29,7 @@ ; ; X64-AVX-LABEL: mask_sitofp_2i64_2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-AVX-NEXT: retq %and = and <2 x i64> %a, @@ -47,7 +47,7 @@ ; ; X32-AVX-LABEL: mask_uitofp_2i64_2f64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X32-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X32-AVX-NEXT: retl ; @@ -60,7 +60,7 @@ ; ; X64-AVX-LABEL: mask_uitofp_2i64_2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-AVX-NEXT: retq %and = and <2 x i64> %a, Index: test/CodeGen/X86/insertelement-shuffle.ll =================================================================== --- test/CodeGen/X86/insertelement-shuffle.ll +++ test/CodeGen/X86/insertelement-shuffle.ll @@ -46,18 +46,10 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind { ; X32_AVX256-LABEL: insert_subvector_512: ; X32_AVX256: # %bb.0: -; X32_AVX256-NEXT: pushl %ebp -; X32_AVX256-NEXT: movl %esp, %ebp -; X32_AVX256-NEXT: andl $-8, %esp -; X32_AVX256-NEXT: subl $8, %esp -; X32_AVX256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32_AVX256-NEXT: vmovlps %xmm2, (%esp) ; X32_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2 -; X32_AVX256-NEXT: vpinsrd $0, (%esp), %xmm2, %xmm2 +; X32_AVX256-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; X32_AVX256-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; X32_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; X32_AVX256-NEXT: movl %ebp, %esp -; X32_AVX256-NEXT: popl %ebp ; X32_AVX256-NEXT: retl ; ; X64_AVX256-LABEL: insert_subvector_512: Index: test/CodeGen/X86/known-signbits-vector.ll =================================================================== --- test/CodeGen/X86/known-signbits-vector.ll +++ test/CodeGen/X86/known-signbits-vector.ll @@ -10,8 +10,12 @@ ; ; X64-LABEL: signbits_sext_v2i64_sitofp_v2f64: ; X64: # %bb.0: -; X64-NEXT: vmovd %edi, %xmm0 -; X64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: movslq %esi, %rcx +; X64-NEXT: vmovq %rcx, %xmm0 +; X64-NEXT: vmovq %rax, %xmm1 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-NEXT: retq %1 = sext i32 %a0 to i64 Index: test/CodeGen/X86/lower-bitcast.ll =================================================================== --- test/CodeGen/X86/lower-bitcast.ll +++ test/CodeGen/X86/lower-bitcast.ll @@ -9,9 +9,7 @@ define double @test1(double %A) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq ; ; CHECK-WIDE-LABEL: test1: @@ -68,9 +66,7 @@ ; CHECK-LABEL: test4: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq ; Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -912,13 +912,12 @@ ; KNL_64-LABEL: test17: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed 
$xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovapd %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -926,36 +925,31 @@ ; KNL_32-LABEL: test17: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovapd %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test17: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovapd %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test17: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %xmm2 {%k1} ; SKX_32-NEXT: vmovapd %xmm2, %xmm0 ; SKX_32-NEXT: retl @@ -1077,8 +1071,8 @@ ; ; KNL_32-LABEL: test20: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 @@ -1096,7 +1090,6 @@ ; ; SKX_32-LABEL: test20: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1} @@ -1110,9 +1103,9 @@ ; KNL_64-LABEL: test21: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -1121,10 +1114,10 @@ ; ; KNL_32-LABEL: test21: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} @@ -1135,7 +1128,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} ; SKX-NEXT: retq ; @@ -1143,8 +1135,6 @@ ; 
SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) @@ -1158,7 +1148,7 @@ ; KNL_64-LABEL: test22: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 @@ -1171,7 +1161,7 @@ ; KNL_32-LABEL: test22: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 @@ -1184,7 +1174,6 @@ ; ; SKX-LABEL: test22: ; SKX: # %bb.0: -; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} @@ -1193,7 +1182,6 @@ ; ; SKX_32-LABEL: test22: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1261,28 +1249,28 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) { ; KNL_64-LABEL: test23: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1290,10 +1278,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq 
; ; SKX_32-LABEL: test23: @@ -1301,10 +1287,8 @@ ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind @@ -1315,28 +1299,28 @@ define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) { ; KNL_64-LABEL: test23b: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1} +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23b: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1} +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1344,9 +1328,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test23b: @@ -1354,9 +1337,8 @@ ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) @@ -1366,22 +1348,22 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test24: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: movw $3, %ax ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 ; 
KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test24: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: movw $3, %cx ; KNL_32-NEXT: kmovw %ecx, %k1 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1389,9 +1371,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovw %eax, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vmovdqa %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test24: @@ -1399,9 +1380,8 @@ ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: movb $3, %cl ; SKX_32-NEXT: kmovw %ecx, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vmovdqa %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind @@ -1413,13 +1393,12 @@ ; KNL_64-LABEL: test25: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -1427,36 +1406,31 @@ ; KNL_32-LABEL: test25: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test25: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test25: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm2 {%k1} ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -1469,11 +1443,10 @@ ; KNL_64-LABEL: test26: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: 
vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} +; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm1 {%k1} ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -1481,32 +1454,27 @@ ; KNL_32-LABEL: test26: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: movb $3, %cl ; KNL_32-NEXT: kmovw %ecx, %k1 -; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} +; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test26: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} +; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm1 {%k1} ; SKX-NEXT: vmovdqa %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test26: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 -; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1} +; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm1 {%k1} ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -1519,40 +1487,40 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test27: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: movw $3, %ax ; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test27: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: movw $3, %cx ; KNL_32-NEXT: kmovw %ecx, %k1 -; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test27: ; SKX: # %bb.0: -; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovw %eax, %k1 -; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1} +; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test27: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: movb $3, %cl ; SKX_32-NEXT: kmovw %ecx, %k1 -; SKX_32-NEXT: vgatherdps (%eax,%xmm1,4), %xmm0 {%k1} +; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1} +; SKX_32-NEXT: vmovaps %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind @@ -1565,7 +1533,7 @@ ; KNL_64-LABEL: test28: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 
killed $xmm1 def $zmm1 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -1574,8 +1542,8 @@ ; ; KNL_32-LABEL: test28: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movw $3, %ax ; KNL_32-NEXT: kmovw %eax, %k1 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} @@ -1584,7 +1552,6 @@ ; ; SKX-LABEL: test28: ; SKX: # %bb.0: -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} ; SKX-NEXT: retq @@ -1593,8 +1560,6 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: movb $3, %al ; SKX_32-NEXT: kmovw %eax, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> ) @@ -2663,28 +2628,26 @@ define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) { ; KNL_64-LABEL: test_scatter_2i32_index: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_64-NEXT: vpsllq $32, %xmm1, %xmm1 -; KNL_64-NEXT: vpsraq $32, %zmm1, %zmm1 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1} +; KNL_64-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1} ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test_scatter_2i32_index: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1 -; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm1,8) {%k1} +; KNL_32-NEXT: vscatterdpd %zmm0, (%eax,%ymm1,8) {%k1} ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -2692,19 +2655,15 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vpsllq $32, %xmm1, %xmm1 -; SKX-NEXT: vpsraq $32, %xmm1, %xmm1 -; SKX-NEXT: vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1} +; SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1} ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_scatter_2i32_index: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1 -; SKX_32-NEXT: vpsllq $32, %xmm1, %xmm1 -; SKX_32-NEXT: vpsraq $32, %xmm1, %xmm1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vscatterqpd %xmm0, (%eax,%xmm1,8) {%k1} +; SKX_32-NEXT: vscatterdpd %xmm0, (%eax,%xmm1,8) {%k1} ; SKX_32-NEXT: retl %gep = getelementptr double, double *%base, <2 x i32> %ind call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask) Index: test/CodeGen/X86/masked_gather_scatter_widen.ll =================================================================== --- 
test/CodeGen/X86/masked_gather_scatter_widen.ll +++ test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -30,24 +30,21 @@ ; ; PROMOTE_SKX-LABEL: test_gather_v2i32_index: ; PROMOTE_SKX: # %bb.0: -; PROMOTE_SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; PROMOTE_SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 -; PROMOTE_SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} +; PROMOTE_SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1} ; PROMOTE_SKX-NEXT: vmovapd %xmm2, %xmm0 ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_gather_v2i32_index: ; PROMOTE_KNL: # %bb.0: ; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; PROMOTE_KNL-NEXT: vpsllq $32, %xmm0, %xmm0 -; PROMOTE_KNL-NEXT: vpsraq $32, %zmm0, %zmm0 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} +; PROMOTE_KNL-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} ; PROMOTE_KNL-NEXT: vmovapd %xmm2, %xmm0 ; PROMOTE_KNL-NEXT: vzeroupper ; PROMOTE_KNL-NEXT: retq @@ -61,11 +58,8 @@ ; ; PROMOTE_AVX2-LABEL: test_gather_v2i32_index: ; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpsllq $32, %xmm0, %xmm3 -; PROMOTE_AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 -; PROMOTE_AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] ; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm1, %xmm1 -; PROMOTE_AVX2-NEXT: vgatherqpd %xmm1, (%rdi,%xmm0,8), %xmm2 +; PROMOTE_AVX2-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2 ; PROMOTE_AVX2-NEXT: vmovapd %xmm2, %xmm0 ; PROMOTE_AVX2-NEXT: retq %gep.random = getelementptr double, double* %base, <2 x i32> %ind @@ -97,21 +91,18 @@ ; PROMOTE_SKX: # %bb.0: ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 -; PROMOTE_SKX-NEXT: vpsllq $32, %xmm1, %xmm1 -; PROMOTE_SKX-NEXT: vpsraq $32, %xmm1, %xmm1 -; PROMOTE_SKX-NEXT: vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1} +; PROMOTE_SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1} ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_scatter_v2i32_index: ; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; PROMOTE_KNL-NEXT: vpsllq $32, %xmm1, %xmm1 -; PROMOTE_KNL-NEXT: vpsraq $32, %zmm1, %zmm1 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1} +; PROMOTE_KNL-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1} ; PROMOTE_KNL-NEXT: vzeroupper ; PROMOTE_KNL-NEXT: retq ; @@ -140,9 +131,7 @@ ; ; PROMOTE_AVX2-LABEL: test_scatter_v2i32_index: ; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpsllq $32, %xmm1, %xmm3 -; PROMOTE_AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 -; PROMOTE_AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] +; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vpsllq $3, %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3 ; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3 @@ -193,21 +182,20 @@ ; PROMOTE_SKX: # %bb.0: ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; PROMOTE_SKX-NEXT: vpgatherqd (,%xmm0), %xmm1 {%k1} -; PROMOTE_SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 
= xmm1[0],zero,xmm1[1],zero +; PROMOTE_SKX-NEXT: vpgatherqd (,%xmm0), %xmm2 {%k1} +; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_gather_v2i32_data: ; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vpgatherqd (,%zmm0), %ymm1 {%k1} -; PROMOTE_KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; PROMOTE_KNL-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1} +; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_KNL-NEXT: vzeroupper ; PROMOTE_KNL-NEXT: retq ; @@ -221,11 +209,10 @@ ; ; PROMOTE_AVX2-LABEL: test_gather_v2i32_data: ; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vpgatherqd %xmm1, (,%xmm0), %xmm2 -; PROMOTE_AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_AVX2-NEXT: retq %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptr, i32 4, <2 x i1> %mask, <2 x i32> %src0) ret <2 x i32>%res @@ -255,16 +242,15 @@ ; PROMOTE_SKX: # %bb.0: ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; PROMOTE_SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_scatter_v2i32_data: ; PROMOTE_KNL: # %bb.0: ; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 ; PROMOTE_KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -303,7 +289,7 @@ ; PROMOTE_AVX2-NEXT: je .LBB3_4 ; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1 ; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax -; PROMOTE_AVX2-NEXT: vextractps $2, %xmm0, (%rax) +; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax) ; PROMOTE_AVX2-NEXT: .LBB3_4: # %else2 ; PROMOTE_AVX2-NEXT: retq call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) @@ -336,22 +322,20 @@ ; PROMOTE_SKX: # %bb.0: ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; PROMOTE_SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; PROMOTE_SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; PROMOTE_SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1} +; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_gather_v2i32_data_index: ; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; PROMOTE_KNL-NEXT: 
kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; PROMOTE_KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; PROMOTE_KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} +; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_KNL-NEXT: vzeroupper ; PROMOTE_KNL-NEXT: retq ; @@ -365,12 +349,10 @@ ; ; PROMOTE_AVX2-LABEL: test_gather_v2i32_data_index: ; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; PROMOTE_AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero ; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2 -; PROMOTE_AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_AVX2-NEXT: retq %gep.random = getelementptr i32, i32* %base, <2 x i32> %ind %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) @@ -401,17 +383,15 @@ ; PROMOTE_SKX: # %bb.0: ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; PROMOTE_SKX-NEXT: vpscatterdd %xmm0, (%rdi,%xmm1,4) {%k1} ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_scatter_v2i32_data_index: ; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 ; PROMOTE_KNL-NEXT: vpscatterdd %zmm0, (%rdi,%zmm1,4) {%k1} @@ -443,9 +423,7 @@ ; ; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data_index: ; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpsllq $32, %xmm1, %xmm3 -; PROMOTE_AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 -; PROMOTE_AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] +; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vpsllq $2, %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3 ; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3 @@ -462,7 +440,7 @@ ; PROMOTE_AVX2-NEXT: je .LBB5_4 ; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1 ; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax -; PROMOTE_AVX2-NEXT: vextractps $2, %xmm0, (%rax) +; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax) ; PROMOTE_AVX2-NEXT: .LBB5_4: # %else2 ; PROMOTE_AVX2-NEXT: retq %gep = getelementptr i32, i32 *%base, <2 x i32> %ind Index: test/CodeGen/X86/masked_memop.ll =================================================================== --- test/CodeGen/X86/masked_memop.ll +++ test/CodeGen/X86/masked_memop.ll @@ -483,30 +483,20 @@ } define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { -; AVX1-LABEL: test14: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: test14: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; 
AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: test14: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: test14: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1} @@ -515,9 +505,9 @@ ; ; SKX-LABEL: test14: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 ; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -529,41 +519,38 @@ ; AVX1-LABEL: test15: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test15: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test15: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test15: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 +; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) @@ -571,32 +558,21 @@ } define <2 x float> @test16(<2 
x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { -; AVX1-LABEL: test16: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test16: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: test16: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: test16: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} @@ -606,9 +582,9 @@ ; ; SKX-LABEL: test16: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 ; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -620,48 +596,41 @@ ; AVX1-LABEL: test17: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test17: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test17: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: ## kill: def $xmm0 
killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} -; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test17: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} -; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 +; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) @@ -669,29 +638,19 @@ } define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) { -; AVX1-LABEL: test18: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test18: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: test18: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: test18: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} @@ -701,9 +660,9 @@ ; ; SKX-LABEL: test18: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer Index: test/CodeGen/X86/mmx-arith.ll =================================================================== --- test/CodeGen/X86/mmx-arith.ll +++ test/CodeGen/X86/mmx-arith.ll @@ -202,76 +202,56 @@ define void @test1(x86_mmx* %A, x86_mmx* %B) { ; X32-LABEL: test1: ; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X32-NEXT: paddq %xmm0, %xmm1 
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: movdqa %xmm1, %xmm2 -; X32-NEXT: pmuludq %xmm0, %xmm2 -; X32-NEXT: psrlq $32, %xmm1 +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: paddd %xmm0, %xmm1 +; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X32-NEXT: pmuludq %xmm0, %xmm1 -; X32-NEXT: psllq $32, %xmm1 -; X32-NEXT: paddq %xmm2, %xmm1 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: andps %xmm1, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; X32-NEXT: movq %xmm1, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X32-NEXT: orps %xmm0, %xmm1 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: xorps %xmm1, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-NEXT: pmuludq %xmm0, %xmm2 +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pand %xmm1, %xmm0 +; X32-NEXT: movq %xmm0, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: por %xmm0, %xmm1 +; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pxor %xmm1, %xmm0 +; X32-NEXT: movq %xmm0, (%ecx) ; X32-NEXT: emms ; X32-NEXT: retl ; ; X64-LABEL: test1: ; X64: # %bb.0: # %entry ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-NEXT: paddq %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rdi) +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: movq %xmm1, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmuludq %xmm0, %xmm2 -; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X64-NEXT: pmuludq %xmm0, %xmm1 -; X64-NEXT: psllq $32, %xmm1 -; X64-NEXT: paddq %xmm2, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rdi) +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: pmuludq %xmm2, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movq %xmm1, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-NEXT: pand %xmm1, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm1, (%rdi) +; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; X64-NEXT: por %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rdi) +; X64-NEXT: movq %xmm1, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-NEXT: pxor %xmm1, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: emms ; X64-NEXT: retq @@ -557,45 +537,34 @@ ; X32-LABEL: test3: ; X32: # %bb.0: # %entry ; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $16, %esp -; X32-NEXT: cmpl $0, 16(%ebp) +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: je .LBB3_1 ; X32-NEXT: # %bb.2: # %bb26.preheader +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-NEXT: xorl %ebx, %ebx ; X32-NEXT: xorl %eax, %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: .p2align 4, 0x90 ; X32-NEXT: .LBB3_3: # %bb26 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl 8(%ebp), %ecx -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl (%ecx,%ebx,8), %ecx -; X32-NEXT: movl 4(%esi,%ebx,8), %esi -; X32-NEXT: movl 12(%ebp), %edi -; X32-NEXT: addl (%edi,%ebx,8), %ecx -; X32-NEXT: adcl 4(%edi,%ebx,8), %esi -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: movl %ecx, (%esp) -; X32-NEXT: adcl %edx, %esi -; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: movd %xmm0, %eax -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X32-NEXT: movd %xmm0, %edx +; X32-NEXT: movl (%edi,%ebx,8), %ebp +; X32-NEXT: movl 4(%edi,%ebx,8), %ecx +; X32-NEXT: addl (%esi,%ebx,8), %ebp +; X32-NEXT: adcl 4(%esi,%ebx,8), %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: incl %ebx -; X32-NEXT: cmpl 16(%ebp), %ebx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: jb .LBB3_3 ; X32-NEXT: jmp .LBB3_4 ; X32-NEXT: .LBB3_1: ; X32-NEXT: xorl %eax, %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: .LBB3_4: # %bb31 -; X32-NEXT: leal -12(%ebp), %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx Index: test/CodeGen/X86/mmx-cvt.ll =================================================================== --- test/CodeGen/X86/mmx-cvt.ll +++ test/CodeGen/X86/mmx-cvt.ll @@ -296,8 +296,8 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $32, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movq (%eax), %mm0 ; X86-NEXT: paddd %mm0, %mm0 Index: test/CodeGen/X86/mulvi32.ll =================================================================== --- test/CodeGen/X86/mulvi32.ll +++ test/CodeGen/X86/mulvi32.ll @@ -7,52 +7,39 @@ ; PR6399 define <2 x i32> @_mul2xi32a(<2 x i32>, <2 x i32>) { -; SSE-LABEL: _mul2xi32a: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: _mul2xi32a: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: 
retq +; +; SSE42-LABEL: _mul2xi32a: +; SSE42: # %bb.0: +; SSE42-NEXT: pmulld %xmm1, %xmm0 +; SSE42-NEXT: retq ; ; AVX-LABEL: _mul2xi32a: ; AVX: # %bb.0: -; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %r = mul <2 x i32> %0, %1 ret <2 x i32> %r } define <2 x i32> @_mul2xi32b(<2 x i32>, <2 x i32>) { -; SSE2-LABEL: _mul2xi32b: -; SSE2: # %bb.0: -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: retq -; -; SSE42-LABEL: _mul2xi32b: -; SSE42: # %bb.0: -; SSE42-NEXT: pmuludq %xmm1, %xmm0 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE42-NEXT: retq +; SSE-LABEL: _mul2xi32b: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: _mul2xi32b: ; AVX: # %bb.0: ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq %factor0 = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> %factor1 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -68,7 +68,7 @@ define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind { ; SSE2-LABEL: v3i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movd %xmm2, 8(%rdi) ; SSE2-NEXT: movq %xmm0, (%rdi) @@ -76,7 +76,7 @@ ; ; SSE42-LABEL: v3i32: ; SSE42: # %bb.0: -; SSE42-NEXT: extractps $2, %xmm0, 8(%rdi) +; SSE42-NEXT: extractps $1, %xmm0, 8(%rdi) ; SSE42-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE42-NEXT: movlps %xmm0, (%rdi) ; SSE42-NEXT: retq @@ -84,14 +84,14 @@ ; AVX-LABEL: v3i32: ; AVX: # %bb.0: ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi) +; AVX-NEXT: vextractps $1, %xmm0, 8(%rdi) ; AVX-NEXT: vmovlps %xmm1, (%rdi) ; AVX-NEXT: retq ; ; XOP-LABEL: v3i32: ; XOP: # %bb.0: ; XOP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; XOP-NEXT: vextractps $2, %xmm0, 8(%rdi) +; XOP-NEXT: vextractps $1, %xmm0, 8(%rdi) ; XOP-NEXT: vmovlps %xmm1, (%rdi) ; XOP-NEXT: retq %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> @@ -259,14 +259,13 @@ ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SSE2-NEXT: movb %al, 6(%rdi) -; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: movd %xmm2, (%rdi) +; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: retq ; @@ -276,8 +275,10 @@ ; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi) ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u] -; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdi) +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pextrw $2, %xmm0, 4(%rdi) +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE42-NEXT: movd %xmm1, (%rdi) ; SSE42-NEXT: retq ; @@ -286,17 +287,20 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpextrb $0, %xmm1, 6(%rdi) -; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX-NEXT: vpextrw $2, %xmm2, 4(%rdi) +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovd %xmm0, (%rdi) ; AVX-NEXT: retq ; ; XOP-LABEL: v7i8: ; XOP: # %bb.0: -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[8],xmm0[12],xmm1[8],xmm0[4],xmm1[12,0,u,u,u,u,u,u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[8,9],xmm0[12,13],xmm1[8,9],xmm0[4,5],xmm1[12,13,0,1,14,15] +; XOP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpextrb $0, %xmm1, 6(%rdi) -; XOP-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; XOP-NEXT: vpextrw $2, %xmm2, 4(%rdi) +; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovd %xmm0, (%rdi) ; XOP-NEXT: retq %r = shufflevector <4 x i8> %a, <4 x i8> %b, <7 x i32> Index: test/CodeGen/X86/pointer-vector.ll =================================================================== --- test/CodeGen/X86/pointer-vector.ll +++ test/CodeGen/X86/pointer-vector.ll @@ -117,7 +117,7 @@ ; CHECK-LABEL: BITCAST1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retl entry: %G = load <2 x i8*>, <2 x i8*>* %p Index: test/CodeGen/X86/ret-mmx.ll =================================================================== --- test/CodeGen/X86/ret-mmx.ll +++ test/CodeGen/X86/ret-mmx.ll @@ -33,7 +33,7 @@ ; CHECK-LABEL: t3: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: retq ret <2 x i32> } Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -1078,36 +1078,87 @@ ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psadbw %xmm3, %xmm2 -; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: sad_2i8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB3_1: # %vector.body -; AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vmovd 
{{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX-NEXT: addq $4, %rax -; AVX-NEXT: jne .LBB3_1 -; AVX-NEXT: # %bb.2: # %middle.block -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: sad_2i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB3_1: # %vector.body +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: addq $4, %rax +; AVX1-NEXT: jne .LBB3_1 +; AVX1-NEXT: # %bb.2: # %middle.block +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: sad_2i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB3_1: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: addq $4, %rax +; AVX2-NEXT: jne .LBB3_1 +; AVX2-NEXT: # %bb.2: # %middle.block +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sad_2i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: .p2align 4, 0x90 +; AVX512-NEXT: .LBB3_1: # %vector.body +; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX512-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: addq $4, %rax +; AVX512-NEXT: jne .LBB3_1 +; AVX512-NEXT: # %bb.2: # %middle.block +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: br label %vector.body Index: test/CodeGen/X86/shrink_vmul.ll 
=================================================================== --- test/CodeGen/X86/shrink_vmul.ll +++ test/CodeGen/X86/shrink_vmul.ll @@ -40,19 +40,29 @@ ; ; X86-AVX-LABEL: mul_2xi8: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: movzbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movzbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movzbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movzbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; @@ -74,10 +84,15 @@ ; X64-AVX-LABEL: mul_2xi8: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movzbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -974,19 +989,29 @@ ; ; X86-AVX-LABEL: mul_2xi8_sext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: movsbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movsbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movsbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movsbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; @@ -1010,10 +1035,15 @@ ; X64-AVX-LABEL: mul_2xi8_sext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: movsbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movsbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1068,19 +1098,29 @@ ; ; X86-AVX-LABEL: mul_2xi8_sext_zext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movsbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movsbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movzbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movzbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; @@ -1105,10 +1145,15 @@ ; X64-AVX-LABEL: mul_2xi8_sext_zext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movsbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movzbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1157,19 +1202,29 @@ ; ; X86-AVX-LABEL: mul_2xi16_sext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl 
{{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: movswl 2(%edx,%ecx), %edi +; X86-AVX-NEXT: movswl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movswl 2(%eax,%ecx), %edx +; X86-AVX-NEXT: movswl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; @@ -1188,10 +1243,15 @@ ; X64-AVX-LABEL: mul_2xi16_sext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: movswl 2(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movswl 2(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1230,38 +1290,43 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X86-SSE-NEXT: psllq $32, %xmm2 -; X86-SSE-NEXT: paddq %xmm1, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_sext_zext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: movswl 2(%edx,%ecx), %edi +; X86-AVX-NEXT: movswl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; @@ -1271,28 +1336,28 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pxor %xmm2, %xmm2 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: psllq $32, %xmm2 -; X64-SSE-NEXT: paddq %xmm1, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_sext_zext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: movswl 2(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1503,13 +1568,20 @@ ; ; X86-AVX-LABEL: mul_2xi8_varconst1: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst1: @@ -1527,12 +1599,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst1: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: movl $255, %ecx -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), 
%ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1571,13 +1642,20 @@ ; ; X86-AVX-LABEL: mul_2xi8_varconst2: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst2: @@ -1596,9 +1674,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst2: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1639,13 +1719,20 @@ ; ; X86-AVX-LABEL: mul_2xi8_varconst3: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst3: @@ -1666,12 +1753,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst3: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100 -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1712,13 +1798,20 @@ ; ; X86-AVX-LABEL: mul_2xi8_varconst4: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: 
movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst4: @@ -1739,9 +1832,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst4: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1782,13 +1877,20 @@ ; ; X86-AVX-LABEL: mul_2xi8_varconst5: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst5: @@ -1809,9 +1911,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst5: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1852,13 +1956,20 @@ ; ; X86-AVX-LABEL: mul_2xi8_varconst6: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst6: @@ -1879,9 +1990,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst6: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movsbl 
1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1924,9 +2037,7 @@ ; X86-AVX-NEXT: movl c, %edx ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1947,12 +2058,7 @@ ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; X64-AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1990,13 +2096,20 @@ ; ; X86-AVX-LABEL: mul_2xi16_varconst2: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movswl 2(%ecx,%eax), %esi +; X86-AVX-NEXT: movswl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_varconst2: @@ -2014,9 +2127,11 @@ ; X64-AVX-LABEL: mul_2xi16_varconst2: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movswl 2(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -2046,17 +2161,13 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,u,65536,u> +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE-NEXT: psrlq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 -; X86-SSE-NEXT: paddq %xmm1, %xmm3 -; X86-SSE-NEXT: psllq $32, %xmm3 -; X86-SSE-NEXT: 
pmuludq %xmm2, %xmm0 -; X86-SSE-NEXT: paddq %xmm3, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -2067,9 +2178,7 @@ ; X86-AVX-NEXT: movl c, %edx ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -2079,15 +2188,13 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pxor %xmm1, %xmm1 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-SSE-NEXT: movq %rcx, %xmm2 -; X64-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psllq $32, %xmm2 -; X64-SSE-NEXT: paddq %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; @@ -2096,12 +2203,7 @@ ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; X64-AVX-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -2131,30 +2233,32 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,u,32768,u> -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: psrlq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 -; X86-SSE-NEXT: paddq %xmm2, %xmm3 -; X86-SSE-NEXT: psllq $32, %xmm3 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: paddq %xmm3, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: 
movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst4: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movswl 2(%ecx,%eax), %esi +; X86-AVX-NEXT: movswl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_varconst4: @@ -2163,28 +2267,24 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 -; X64-SSE-NEXT: movq %rcx, %xmm1 -; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: pxor %xmm2, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psllq $32, %xmm2 -; X64-SSE-NEXT: paddq %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_varconst4: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 -; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000 -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movswl 2(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: Index: test/CodeGen/X86/shuffle-strided-with-offset-128.ll =================================================================== --- test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -144,29 +144,11 @@ ; AVX-NEXT: vmovlps %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512BWVL: # %bb.0: 
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v4i32_to_v2i32_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %L %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> store <2 x i32> %strided.vec, <2 x i32>* %S Index: test/CodeGen/X86/shuffle-vs-trunc-128.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -247,29 +247,11 @@ ; AVX-NEXT: vmovlps %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v4i32_to_v2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_to_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v4i32_to_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v4i32_to_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %L %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> store <2 x i32> %strided.vec, <2 x i32>* %S @@ -283,16 +265,35 @@ ; SSE-NEXT: movq %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: trunc_v2i64_to_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT: vmovlps %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_v2i64_to_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vmovlps %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovlps %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v2i64_to_v2i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovlps %xmm0, (%rsi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_v2i64_to_v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v2i64_to_v2i32: @@ -303,8 +304,10 @@ ; ; AVX512BW-LABEL: trunc_v2i64_to_v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32: Index: test/CodeGen/X86/sse-fsignum.ll 
=================================================================== --- test/CodeGen/X86/sse-fsignum.ll +++ test/CodeGen/X86/sse-fsignum.ll @@ -33,19 +33,51 @@ } define void @signum64a(<2 x double>*) { -; AVX-LABEL: signum64a: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovapd (%rdi), %xmm0 -; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX-NEXT: vcvtdq2pd %xmm2, %xmm2 -; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: vsubpd %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmovapd %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: signum64a: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd (%rdi), %xmm0 +; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vcvtdq2pd %xmm2, %xmm2 +; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovapd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: signum64a: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovapd (%rdi), %xmm0 +; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vcvtdq2pd %xmm2, %xmm2 +; AVX2-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovapd %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: signum64a: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vmovapd (%rdi), %xmm0 +; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vcvtdq2pd %xmm2, %xmm2 +; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX512F-NEXT: vsubpd %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vmovapd %xmm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq entry: %1 = load <2 x double>, <2 x double>* %0 %2 = fcmp olt <2 x double> %1, zeroinitializer Index: test/CodeGen/X86/trunc-ext-ld-st.ll =================================================================== --- test/CodeGen/X86/trunc-ext-ld-st.ll +++ test/CodeGen/X86/trunc-ext-ld-st.ll @@ -61,22 +61,12 @@ } define void @load_2_i32(<2 x i32>* %A) { -; SSE2-LABEL: load_2_i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSE41-LABEL: load_2_i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: retq +; CHECK-LABEL: load_2_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movq %xmm0, (%rdi) +; CHECK-NEXT: retq %T = load <2 x i32>, <2 x i32>* %A %G = add <2 x i32> %T, store <2 x i32> %G, <2 x i32>* %A Index: 
test/CodeGen/X86/trunc-subvector.ll =================================================================== --- test/CodeGen/X86/trunc-subvector.ll +++ test/CodeGen/X86/trunc-subvector.ll @@ -40,24 +40,14 @@ define <2 x i32> @test3(<8 x i32> %v) { ; SSE2-LABEL: test3: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; AVX2-LABEL: test3: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test3: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test3: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -67,23 +57,13 @@ define <2 x i32> @test4(<8 x i32> %v) { ; SSE2-LABEL: test4: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; -; AVX2-LABEL: test4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test4: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -93,14 +73,8 @@ define <2 x i32> @test5(<8 x i32> %v) { ; SSE2-LABEL: test5: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; AVX2-LABEL: test5: @@ -109,7 +83,8 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,2,4,6,4,6,6,7] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -117,9 +92,11 @@ ; AVX512-LABEL: test5: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> @@ -165,23 +142,13 @@ ; SSE2-LABEL: test8: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; -; AVX2-LABEL: test8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test8: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -191,22 +158,13 @@ define <2 x i32> @test9(<8 x i32> %v) { ; SSE2-LABEL: test9: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; -; AVX2-LABEL: test9: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test9: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test9: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -216,10 +174,8 @@ define <2 x i32> @test10(<8 x i32> %v) { ; SSE2-LABEL: test10: ; SSE2: # %bb.0: -; SSE2-NEXT: xorpd %xmm2, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; AVX2-LABEL: test10: @@ -228,7 +184,8 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,2,4,6,4,6,6,7] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -236,9 +193,11 @@ ; AVX512-LABEL: test10: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; 
AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> Index: test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll =================================================================== --- test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -692,17 +692,13 @@ ; CHECK-SSE2-LABEL: out_v2i32: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 ; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: out_v2i32: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %mx = and <2 x i32> %x, %mask %notmask = xor <2 x i32> %mask, Index: test/CodeGen/X86/vec_cast3.ll =================================================================== --- test/CodeGen/X86/vec_cast3.ll +++ test/CodeGen/X86/vec_cast3.ll @@ -5,9 +5,9 @@ define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) { ; CHECK-LABEL: cvt_v2i8_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpsllq $56, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: vpslld $24, %xmm0, %xmm0 ; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -23,9 +23,9 @@ define <2 x float> @cvt_v2i16_v2f32(<2 x i16> %src) { ; CHECK-LABEL: cvt_v2i16_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpsllq $48, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: vpslld $16, %xmm0, %xmm0 ; CHECK-NEXT: vpsrad $16, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -41,7 +41,6 @@ define <2 x float> @cvt_v2i32_v2f32(<2 x i32> %src) { ; CHECK-LABEL: cvt_v2i32_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -56,7 +55,7 @@ define <2 x float> @cvt_v2u8_v2f32(<2 x i8> %src) { ; CHECK-LABEL: cvt_v2u8_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -72,7 +71,9 @@ define <2 x float> @cvt_v2u16_v2f32(<2 x i16> %src) { ; CHECK-LABEL: cvt_v2u16_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,xmm0[10,11],zero,zero +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -88,10 +89,9 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) { ; 
CHECK-LABEL: cvt_v2u32_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4503599627370496,4503599627370496] -; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4503599627370496,4503599627370496] +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ; CHECK-NEXT: retl @@ -172,7 +172,6 @@ ; CHECK-LABEL: cvt_v2f32_v2i32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2i32: @@ -284,34 +283,22 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: subl $68, %esp ; CHECK-NEXT: .cfi_def_cfa_offset 72 -; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vcmpltss %xmm2, %xmm1, %xmm3 -; CHECK-NEXT: vsubss %xmm2, %xmm1, %xmm4 -; CHECK-NEXT: vblendvps %xmm3, %xmm1, %xmm4, %xmm3 -; CHECK-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) -; CHECK-NEXT: vcmpltss %xmm2, %xmm0, %xmm3 -; CHECK-NEXT: vsubss %xmm2, %xmm0, %xmm4 -; CHECK-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm3 -; CHECK-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; CHECK-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; CHECK-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) +; CHECK-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ; CHECK-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-NEXT: fisttpll (%esp) +; CHECK-NEXT: fisttpll {{[0-9]+}}(%esp) ; CHECK-NEXT: flds {{[0-9]+}}(%esp) ; CHECK-NEXT: fisttpll {{[0-9]+}}(%esp) -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: vucomiss %xmm2, %xmm1 -; CHECK-NEXT: setae %al -; CHECK-NEXT: shll $31, %eax -; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: vucomiss %xmm2, %xmm0 -; CHECK-NEXT: setae %cl -; CHECK-NEXT: shll $31, %ecx -; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fisttpll {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fisttpll (%esp) ; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; CHECK-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; CHECK-NEXT: vpinsrd $3, (%esp), %xmm0, %xmm0 ; CHECK-NEXT: addl $68, %esp ; CHECK-NEXT: retl ; Index: test/CodeGen/X86/vec_ctbits.ll =================================================================== --- test/CodeGen/X86/vec_ctbits.ll +++ test/CodeGen/X86/vec_ctbits.ll @@ -110,9 +110,8 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind { ; CHECK-LABEL: promtz: ; CHECK: # %bb.0: -; CHECK-NEXT: por {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-NEXT: paddq %xmm0, %xmm1 +; CHECK-NEXT: paddd %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 @@ -129,7 +128,12 @@ ; CHECK-NEXT: paddb %xmm0, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; CHECK-NEXT: psadbw %xmm0, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: psadbw 
%xmm0, %xmm1 +; CHECK-NEXT: packuswb %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) @@ -139,44 +143,44 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind { ; CHECK-LABEL: promlz: ; CHECK: # %bb.0: -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $1, %xmm1 +; CHECK-NEXT: psrld $1, %xmm1 ; CHECK-NEXT: por %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrlq $2, %xmm0 +; CHECK-NEXT: psrld $2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $4, %xmm1 +; CHECK-NEXT: psrld $4, %xmm1 ; CHECK-NEXT: por %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrlq $8, %xmm0 +; CHECK-NEXT: psrld $8, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $16, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 ; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrlq $32, %xmm0 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: psrlw $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psubb %xmm0, %xmm1 +; CHECK-NEXT: psubb %xmm0, %xmm2 ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: psrlw $2, %xmm1 +; CHECK-NEXT: movdqa %xmm2, %xmm1 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: paddb %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: psrlw $4, %xmm2 +; CHECK-NEXT: psrlw $2, %xmm2 +; CHECK-NEXT: pand %xmm0, %xmm2 ; CHECK-NEXT: paddb %xmm1, %xmm2 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: psadbw %xmm2, %xmm0 -; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: psrlw $4, %xmm0 +; CHECK-NEXT: paddb %xmm2, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-NEXT: psadbw %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: psadbw %xmm1, %xmm0 +; CHECK-NEXT: packuswb %xmm2, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) ret <2 x i32> %c @@ -186,23 +190,27 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind { ; CHECK-LABEL: prompop: ; CHECK: # %bb.0: -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: psubb %xmm1, %xmm0 ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; CHECK-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 ; CHECK-NEXT: psrlw $2, %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 -; CHECK-NEXT: paddb %xmm3, %xmm0 +; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 ; CHECK-NEXT: paddb %xmm0, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psadbw %xmm2, %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; 
CHECK-NEXT: psadbw %xmm0, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: psadbw %xmm0, %xmm1 +; CHECK-NEXT: packuswb %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) Index: test/CodeGen/X86/vec_extract-mmx.ll =================================================================== --- test/CodeGen/X86/vec_extract-mmx.ll +++ test/CodeGen/X86/vec_extract-mmx.ll @@ -125,12 +125,10 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %ebp ; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $32, %esp ; X32-NEXT: movq %mm0, (%esp) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X32-NEXT: movd %xmm0, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: retl @@ -138,9 +136,7 @@ ; X64-LABEL: test4: ; X64: # %bb.0: ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: retq %tmp0 = bitcast x86_mmx %a to <2 x i32> %tmp1 = extractelement <2 x i32> %tmp0, i32 1 Index: test/CodeGen/X86/vec_fp_to_int.ll =================================================================== --- test/CodeGen/X86/vec_fp_to_int.ll +++ test/CodeGen/X86/vec_fp_to_int.ll @@ -115,13 +115,11 @@ ; SSE-LABEL: fptosi_2f64_to_2i32: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_2i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq ; ; WIDEN-LABEL: fptosi_2f64_to_2i32: @@ -416,52 +414,23 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f64_to_4i32: ; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttsd2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rdx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, 
%xmm2 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttsd2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; VEX-NEXT: vcvttsd2si %xmm1, %rax ; VEX-NEXT: vcvttsd2si %xmm0, %rcx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_4i32: @@ -510,50 +479,25 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: subsd %xmm1, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm1, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f64_to_2i32: ; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttsd2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rdx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttsd2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; VEX-NEXT: vcvttsd2si %xmm1, %rax ; VEX-NEXT: vcvttsd2si %xmm0, %rcx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; VEX-NEXT: vcvttsd2si %xmm0, %rax +; VEX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; VEX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_2i32: @@ -602,29 +546,13 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, 
%xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_4f64_to_2i32: @@ -888,46 +816,20 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm1, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm1, %xmm4 -; SSE-NEXT: subsd %xmm2, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd %xmm0, %xmm4 -; SSE-NEXT: subsd %xmm2, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_4f64_to_4i32: @@ -999,13 +901,11 @@ ; SSE-LABEL: fptosi_2f32_to_2i32: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f32_to_2i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq ; ; WIDEN-LABEL: fptosi_2f32_to_2i32: @@ -1478,77 +1378,64 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: 
movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: subss %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: cvttss2si %xmm2, %rax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subss %xmm2, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rcx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f32_to_2i32: ; VEX: # %bb.0: -; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttss2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rdx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttss2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; VEX-NEXT: vcvttss2si %xmm1, %rax ; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: vmovd %ecx, %xmm1 +; VEX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; VEX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; VEX-NEXT: vcvttss2si %xmm2, %rax +; VEX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f32_to_2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f32_to_2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_2f32_to_2i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, 
%xmm0 -; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VLDQ-NEXT: retq ; ; WIDEN_SKX-LABEL: fptoui_2f32_to_2i32: @@ -2673,7 +2560,8 @@ ; SSE-LABEL: fptosi_2f16_to_4i32: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rax -; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: callq __gnu_f2h_ieee ; SSE-NEXT: movzwl %ax, %edi ; SSE-NEXT: callq __gnu_h2f_ieee @@ -2683,20 +2571,20 @@ ; SSE-NEXT: callq __gnu_f2h_ieee ; SSE-NEXT: movzwl %ax, %edi ; SSE-NEXT: callq __gnu_h2f_ieee -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: cvttss2si (%rsp), %rax # 4-byte Folded Reload -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f16_to_4i32: ; VEX: # %bb.0: ; VEX-NEXT: pushq %rax -; VEX-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; VEX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; VEX-NEXT: vmovaps %xmm1, %xmm0 ; VEX-NEXT: callq __gnu_f2h_ieee ; VEX-NEXT: movzwl %ax, %edi ; VEX-NEXT: callq __gnu_h2f_ieee @@ -2706,27 +2594,27 @@ ; VEX-NEXT: callq __gnu_f2h_ieee ; VEX-NEXT: movzwl %ax, %edi ; VEX-NEXT: callq __gnu_h2f_ieee -; VEX-NEXT: vcvttss2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vcvttss2si (%rsp), %rax # 4-byte Folded Reload -; VEX-NEXT: vmovq %rax, %xmm1 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; VEX-NEXT: vcvttss2si %xmm0, %eax +; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vmovd %eax, %xmm1 +; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; VEX-NEXT: popq %rax ; VEX-NEXT: retq ; ; AVX512-LABEL: fptosi_2f16_to_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvttss2si %xmm1, %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vcvttss2si %xmm0, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvttss2si %xmm0, %eax +; AVX512-NEXT: vcvttss2si %xmm1, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512-NEXT: retq ; ; WIDEN-LABEL: fptosi_2f16_to_4i32: @@ -2757,32 +2645,31 @@ ; SSE-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movw $3199, 
-{{[0-9]+}}(%rsp) # imm = 0xC7F ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f80_to_4i32: ; AVX: # %bb.0: ; AVX-NEXT: fldt {{[0-9]+}}(%rsp) ; AVX-NEXT: fldt {{[0-9]+}}(%rsp) -; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp) -; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: retq ; ; WIDEN-LABEL: fptosi_2f80_to_4i32: @@ -2804,51 +2691,44 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { ; SSE-LABEL: fptosi_2f128_to_4i32: ; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movq %rsi, %r14 -; SSE-NEXT: movq %rdi, %rbx -; SSE-NEXT: movq %rdx, %rdi -; SSE-NEXT: movq %rcx, %rsi -; SSE-NEXT: callq __fixtfdi -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movq %rcx, %r14 +; SSE-NEXT: movq %rdx, %rbx +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movl %eax, %ebp ; SSE-NEXT: movq %rbx, %rdi ; SSE-NEXT: movq %r14, %rsi -; SSE-NEXT: callq __fixtfdi -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] -; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ebp, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f128_to_4i32: ; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp ; AVX-NEXT: pushq %r14 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: movq %rsi, %r14 -; AVX-NEXT: movq %rdi, %rbx -; AVX-NEXT: movq %rdx, %rdi -; AVX-NEXT: movq %rcx, %rsi -; AVX-NEXT: callq __fixtfdi -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: movq %rcx, %r14 +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: movl %eax, %ebp ; AVX-NEXT: movq %rbx, %rdi ; AVX-NEXT: movq %r14, %rsi -; AVX-NEXT: callq __fixtfdi -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: vmovd %eax, %xmm0 +; 
AVX-NEXT: vmovd %ebp, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %rbp ; AVX-NEXT: retq ; ; WIDEN-LABEL: fptosi_2f128_to_4i32: Index: test/CodeGen/X86/vec_insert-5.ll =================================================================== --- test/CodeGen/X86/vec_insert-5.ll +++ test/CodeGen/X86/vec_insert-5.ll @@ -17,11 +17,9 @@ ; ; X64-LABEL: t1: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: shll $12, %edi -; X64-NEXT: movq %rdi, %xmm0 -; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq %tmp12 = shl i32 %a, 12 Index: test/CodeGen/X86/vec_insert-7.ll =================================================================== --- test/CodeGen/X86/vec_insert-7.ll +++ test/CodeGen/X86/vec_insert-7.ll @@ -8,18 +8,22 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind { ; X32-LABEL: mmx_movzl: ; X32: ## %bb.0: -; X32-NEXT: subl $20, %esp +; X32-NEXT: subl $44, %esp +; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp) +; X32-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: movl $32, %eax -; X32-NEXT: movd %eax, %xmm0 -; X32-NEXT: movq %xmm0, (%esp) +; X32-NEXT: pinsrd $0, %eax, %xmm0 +; X32-NEXT: pxor %xmm1, %xmm1 +; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; X32-NEXT: movdqa %xmm1, (%esp) ; X32-NEXT: movq (%esp), %mm0 -; X32-NEXT: addl $20, %esp +; X32-NEXT: addl $44, %esp ; X32-NEXT: retl ; ; X64-LABEL: mmx_movzl: ; X64: ## %bb.0: ; X64-NEXT: movl $32, %eax -; X64-NEXT: movq %rax, %xmm0 +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: retq %tmp = bitcast x86_mmx %x to <2 x i32> %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 Index: test/CodeGen/X86/vec_insert-mmx.ll =================================================================== --- test/CodeGen/X86/vec_insert-mmx.ll +++ test/CodeGen/X86/vec_insert-mmx.ll @@ -13,10 +13,8 @@ ; ; X64-LABEL: t0: ; X64: ## %bb.0: -; X64-NEXT: ## kill: def $edi killed $edi def $rdi -; X64-NEXT: movq %rdi, %xmm0 -; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; X64-NEXT: retq %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1 %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -3192,15 +3192,19 @@ ; ; SSE41-LABEL: sitofp_load_2i16_to_2f64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: movswl 2(%rdi), %eax +; SSE41-NEXT: movswl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 ; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: sitofp_load_2i16_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: movswl 2(%rdi), %eax +; AVX-NEXT: movswl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i16>, <2 x 
i16> *%a @@ -3221,15 +3225,19 @@ ; ; SSE41-LABEL: sitofp_load_2i8_to_2f64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: movsbl 1(%rdi), %eax +; SSE41-NEXT: movsbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 ; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: sitofp_load_2i8_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: movsbl 1(%rdi), %eax +; AVX-NEXT: movsbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i8>, <2 x i8> *%a @@ -3633,15 +3641,19 @@ ; ; SSE41-LABEL: uitofp_load_2i8_to_2f64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: movzbl 1(%rdi), %eax +; SSE41-NEXT: movzbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 ; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: uitofp_load_2i8_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: movzbl 1(%rdi), %eax +; AVX-NEXT: movzbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i8>, <2 x i8> *%a Index: test/CodeGen/X86/vec_zero_cse.ll =================================================================== --- test/CodeGen/X86/vec_zero_cse.ll +++ test/CodeGen/X86/vec_zero_cse.ll @@ -22,7 +22,8 @@ ; X64-LABEL: test1: ; X64: # %bb.0: ; X64-NEXT: movq $0, {{.*}}(%rip) -; X64-NEXT: movq $0, {{.*}}(%rip) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movlps %xmm0, {{.*}}(%rip) ; X64-NEXT: retq store <1 x i64> zeroinitializer, <1 x i64>* @M1 store <2 x i32> zeroinitializer, <2 x i32>* @M2 @@ -41,8 +42,8 @@ ; X64-LABEL: test2: ; X64: # %bb.0: ; X64-NEXT: movq $-1, {{.*}}(%rip) -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movq %rax, {{.*}}(%rip) +; X64-NEXT: pcmpeqd %xmm0, %xmm0 +; X64-NEXT: movq %xmm0, {{.*}}(%rip) ; X64-NEXT: retq store <1 x i64> < i64 -1 >, <1 x i64>* @M1 store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2 Index: test/CodeGen/X86/vector-idiv-v2i32.ll =================================================================== --- test/CodeGen/X86/vector-idiv-v2i32.ll +++ test/CodeGen/X86/vector-idiv-v2i32.ll @@ -8,58 +8,56 @@ ; X64-LABEL: test_udiv7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 ; X64-NEXT: shrq $32, %rcx ; X64-NEXT: subl %ecx, %eax ; X64-NEXT: shrl %eax ; X64-NEXT: addl %ecx, %eax ; X64-NEXT: shrl $2, %eax -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: subl %edx, %ecx -; X64-NEXT: shrl %ecx -; X64-NEXT: addl %edx, %ecx -; X64-NEXT: shrl $2, %ecx -; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: movd %eax, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; 
X64-NEXT: movd %xmm0, %eax +; X64-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: shrl $2, %eax +; X64-NEXT: movd %eax, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movq %xmm1, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_udiv7_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X86-NEXT: movd %xmm0, %esi -; X86-NEXT: movl $613566757, %ebx # imm = 0x24924925 -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: subl %edx, %esi -; X86-NEXT: shrl %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: shrl $2, %esi +; X86-NEXT: movl $613566757, %edi # imm = 0x24924925 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: subl %edx, %ecx +; X86-NEXT: shrl %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: shrl $2, %ecx +; X86-NEXT: movd %ecx, %xmm1 +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-NEXT: movd %xmm0, %ecx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %edi ; X86-NEXT: subl %edx, %ecx ; X86-NEXT: shrl %ecx ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: shrl $2, %ecx ; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: movd %esi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%edi) +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_udiv7_v2i32: @@ -88,31 +86,33 @@ ; ; X86_WIDEN-LABEL: test_udiv7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl (%eax), %ecx -; X86_WIDEN-NEXT: movl 4(%eax), %esi -; X86_WIDEN-NEXT: movl $613566757, %ebx # imm = 0x24924925 +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: movl $613566757, %edi # imm = 0x24924925 ; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: mull %ebx +; X86_WIDEN-NEXT: mull %edi ; X86_WIDEN-NEXT: subl %edx, %ecx ; X86_WIDEN-NEXT: shrl %ecx ; X86_WIDEN-NEXT: addl %edx, %ecx ; X86_WIDEN-NEXT: shrl $2, %ecx -; X86_WIDEN-NEXT: movl %esi, %eax -; X86_WIDEN-NEXT: mull %ebx -; X86_WIDEN-NEXT: subl %edx, %esi -; X86_WIDEN-NEXT: shrl %esi -; X86_WIDEN-NEXT: addl %edx, %esi -; X86_WIDEN-NEXT: shrl $2, %esi -; X86_WIDEN-NEXT: movl %esi, 4(%edi) -; X86_WIDEN-NEXT: movl %ecx, (%edi) +; X86_WIDEN-NEXT: movd %ecx, %xmm1 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: movl %ecx, %eax +; X86_WIDEN-NEXT: mull %edi +; X86_WIDEN-NEXT: subl %edx, %ecx +; X86_WIDEN-NEXT: shrl %ecx +; X86_WIDEN-NEXT: addl %edx, %ecx +; X86_WIDEN-NEXT: shrl $2, %ecx +; X86_WIDEN-NEXT: movd %ecx, %xmm0 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%esi) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi -; X86_WIDEN-NEXT: popl %ebx ; X86_WIDEN-NEXT: retl %a = load <2 x 
i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, @@ -124,58 +124,57 @@ ; X64-LABEL: test_urem7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: subl %edx, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: addl %edx, %eax -; X64-NEXT: shrl $2, %eax -; X64-NEXT: leal (,%rax,8), %edx -; X64-NEXT: subl %edx, %eax -; X64-NEXT: addl %ecx, %eax -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: movl %ecx, %edi -; X64-NEXT: subl %edx, %edi -; X64-NEXT: shrl %edi -; X64-NEXT: addl %edx, %edi -; X64-NEXT: shrl $2, %edi -; X64-NEXT: leal (,%rdi,8), %edx -; X64-NEXT: subl %edx, %edi -; X64-NEXT: addl %ecx, %edi -; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: movd %eax, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: movl %eax, %edx +; X64-NEXT: subl %ecx, %edx +; X64-NEXT: shrl %edx +; X64-NEXT: addl %ecx, %edx +; X64-NEXT: shrl $2, %edx +; X64-NEXT: leal (,%rdx,8), %ecx +; X64-NEXT: subl %ecx, %edx +; X64-NEXT: addl %eax, %edx +; X64-NEXT: movd %edx, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: movl %eax, %edx +; X64-NEXT: subl %ecx, %edx +; X64-NEXT: shrl %edx +; X64-NEXT: addl %ecx, %edx +; X64-NEXT: shrl $2, %edx +; X64-NEXT: leal (,%rdx,8), %ecx +; X64-NEXT: subl %ecx, %edx +; X64-NEXT: addl %eax, %edx +; X64-NEXT: movd %edx, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movq %xmm1, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_urem7_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X86-NEXT: movd %xmm0, %esi ; X86-NEXT: movl $613566757, %edi # imm = 0x24924925 -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: subl %edx, %ebx -; X86-NEXT: shrl %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: shrl $2, %ebx -; X86-NEXT: leal (,%ebx,8), %eax -; X86-NEXT: subl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: subl %edx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (,%eax,8), %edx +; X86-NEXT: subl %edx, %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movd %eax, %xmm1 +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-NEXT: movd %xmm0, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %ecx, %eax @@ -187,13 +186,10 @@ ; X86-NEXT: subl %edx, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movd %eax, %xmm0 -; X86-NEXT: movd %ebx, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%ebp) +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%esi) ; X86-NEXT: popl 
%esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_urem7_v2i32: @@ -230,27 +226,15 @@ ; ; X86_WIDEN-LABEL: test_urem7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebp -; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl (%eax), %esi -; X86_WIDEN-NEXT: movl 4(%eax), %ecx -; X86_WIDEN-NEXT: movl $613566757, %ebx # imm = 0x24924925 -; X86_WIDEN-NEXT: movl %esi, %eax -; X86_WIDEN-NEXT: mull %ebx -; X86_WIDEN-NEXT: movl %esi, %ebp -; X86_WIDEN-NEXT: subl %edx, %ebp -; X86_WIDEN-NEXT: shrl %ebp -; X86_WIDEN-NEXT: addl %edx, %ebp -; X86_WIDEN-NEXT: shrl $2, %ebp -; X86_WIDEN-NEXT: leal (,%ebp,8), %eax -; X86_WIDEN-NEXT: subl %eax, %ebp -; X86_WIDEN-NEXT: addl %esi, %ebp +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: movl $613566757, %edi # imm = 0x24924925 ; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: mull %ebx +; X86_WIDEN-NEXT: mull %edi ; X86_WIDEN-NEXT: movl %ecx, %eax ; X86_WIDEN-NEXT: subl %edx, %eax ; X86_WIDEN-NEXT: shrl %eax @@ -259,12 +243,24 @@ ; X86_WIDEN-NEXT: leal (,%eax,8), %edx ; X86_WIDEN-NEXT: subl %edx, %eax ; X86_WIDEN-NEXT: addl %ecx, %eax -; X86_WIDEN-NEXT: movl %eax, 4(%edi) -; X86_WIDEN-NEXT: movl %ebp, (%edi) +; X86_WIDEN-NEXT: movd %eax, %xmm1 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: movl %ecx, %eax +; X86_WIDEN-NEXT: mull %edi +; X86_WIDEN-NEXT: movl %ecx, %eax +; X86_WIDEN-NEXT: subl %edx, %eax +; X86_WIDEN-NEXT: shrl %eax +; X86_WIDEN-NEXT: addl %edx, %eax +; X86_WIDEN-NEXT: shrl $2, %eax +; X86_WIDEN-NEXT: leal (,%eax,8), %edx +; X86_WIDEN-NEXT: subl %edx, %eax +; X86_WIDEN-NEXT: addl %ecx, %eax +; X86_WIDEN-NEXT: movd %eax, %xmm0 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%esi) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi -; X86_WIDEN-NEXT: popl %ebx -; X86_WIDEN-NEXT: popl %ebp ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = urem <2 x i32> %a, @@ -276,8 +272,7 @@ ; X64-LABEL: test_sdiv7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: cltq ; X64-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 ; X64-NEXT: shrq $32, %rcx @@ -286,57 +281,55 @@ ; X64-NEXT: shrl $31, %ecx ; X64-NEXT: sarl $2, %eax ; X64-NEXT: addl %ecx, %eax -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: movslq %ecx, %rcx -; X64-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: addl %edx, %ecx -; X64-NEXT: movl %ecx, %edx -; X64-NEXT: shrl $31, %edx -; X64-NEXT: sarl $2, %ecx -; X64-NEXT: addl %edx, %ecx -; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: movd %eax, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cltq +; X64-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: shrl $31, %ecx +; X64-NEXT: sarl $2, %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: movd %eax, %xmm0 +; 
X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movq %xmm1, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_sdiv7_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-NEXT: movd %xmm0, %esi -; X86-NEXT: movl $-1840700269, %ebp # imm = 0x92492493 +; X86-NEXT: movl $-1840700269, %ebx # imm = 0x92492493 ; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull %ebx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: shrl $31, %eax -; X86-NEXT: sarl $2, %edi -; X86-NEXT: addl %eax, %edi +; X86-NEXT: sarl $2, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: movd %edx, %xmm0 ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: imull %ebp +; X86-NEXT: imull %ebx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl %edx, %eax ; X86-NEXT: shrl $31, %eax ; X86-NEXT: sarl $2, %edx ; X86-NEXT: addl %eax, %edx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movd %edi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%ebx) +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%edi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_sdiv7_v2i32: @@ -369,36 +362,37 @@ ; ; X86_WIDEN-LABEL: test_sdiv7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebp ; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl (%eax), %ecx -; X86_WIDEN-NEXT: movl 4(%eax), %esi -; X86_WIDEN-NEXT: movl $-1840700269, %ebp # imm = 0x92492493 -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: imull %ebp -; X86_WIDEN-NEXT: movl %edx, %edi -; X86_WIDEN-NEXT: addl %ecx, %edi -; X86_WIDEN-NEXT: movl %edi, %eax -; X86_WIDEN-NEXT: shrl $31, %eax -; X86_WIDEN-NEXT: sarl $2, %edi -; X86_WIDEN-NEXT: addl %eax, %edi +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %esi +; X86_WIDEN-NEXT: movl $-1840700269, %ebx # imm = 0x92492493 ; X86_WIDEN-NEXT: movl %esi, %eax -; X86_WIDEN-NEXT: imull %ebp +; X86_WIDEN-NEXT: imull %ebx ; X86_WIDEN-NEXT: addl %esi, %edx ; X86_WIDEN-NEXT: movl %edx, %eax ; X86_WIDEN-NEXT: shrl $31, %eax ; X86_WIDEN-NEXT: sarl $2, %edx ; X86_WIDEN-NEXT: addl %eax, %edx -; X86_WIDEN-NEXT: movl %edx, 4(%ebx) -; X86_WIDEN-NEXT: movl %edi, (%ebx) +; X86_WIDEN-NEXT: movd %edx, %xmm0 +; X86_WIDEN-NEXT: movl %ecx, %eax +; X86_WIDEN-NEXT: imull %ebx +; X86_WIDEN-NEXT: addl %ecx, %edx +; X86_WIDEN-NEXT: movl %edx, %eax +; X86_WIDEN-NEXT: shrl $31, %eax +; X86_WIDEN-NEXT: sarl $2, %edx +; X86_WIDEN-NEXT: addl %eax, %edx +; X86_WIDEN-NEXT: movd %edx, %xmm1 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%edi) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi ; 
X86_WIDEN-NEXT: popl %ebx -; X86_WIDEN-NEXT: popl %ebp ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = sdiv <2 x i32> %a, @@ -410,63 +404,62 @@ ; X64-LABEL: test_srem7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm1, %eax -; X64-NEXT: movslq %eax, %rcx -; X64-NEXT: imulq $-1840700269, %rcx, %rax # imm = 0x92492493 -; X64-NEXT: shrq $32, %rax -; X64-NEXT: addl %ecx, %eax -; X64-NEXT: movl %eax, %edx +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cltq +; X64-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: addl %eax, %ecx +; X64-NEXT: movl %ecx, %edx ; X64-NEXT: shrl $31, %edx -; X64-NEXT: sarl $2, %eax -; X64-NEXT: addl %edx, %eax -; X64-NEXT: leal (,%rax,8), %edx -; X64-NEXT: subl %edx, %eax -; X64-NEXT: addl %ecx, %eax -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: movslq %ecx, %rcx -; X64-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: addl %ecx, %edx -; X64-NEXT: movl %edx, %edi -; X64-NEXT: shrl $31, %edi -; X64-NEXT: sarl $2, %edx -; X64-NEXT: addl %edi, %edx -; X64-NEXT: leal (,%rdx,8), %edi -; X64-NEXT: subl %edi, %edx -; X64-NEXT: addl %ecx, %edx -; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: movd %eax, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: sarl $2, %ecx +; X64-NEXT: addl %edx, %ecx +; X64-NEXT: leal (,%rcx,8), %edx +; X64-NEXT: subl %edx, %ecx +; X64-NEXT: addl %eax, %ecx +; X64-NEXT: movd %ecx, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cltq +; X64-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: addl %eax, %ecx +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: shrl $31, %edx +; X64-NEXT: sarl $2, %ecx +; X64-NEXT: addl %edx, %ecx +; X64-NEXT: leal (,%rcx,8), %edx +; X64-NEXT: subl %edx, %ecx +; X64-NEXT: addl %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movq %xmm1, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_srem7_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-NEXT: movd %xmm0, %esi -; X86-NEXT: movl $-1840700269, %ebx # imm = 0x92492493 +; X86-NEXT: movl $-1840700269, %edi # imm = 0x92492493 ; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull %edi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: shrl $31, %eax -; X86-NEXT: sarl $2, %edi -; X86-NEXT: addl %eax, %edi -; X86-NEXT: leal (,%edi,8), %eax -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: addl %esi, %edi +; X86-NEXT: sarl $2, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (,%edx,8), %eax +; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movd %edx, %xmm0 ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: imull %ebx +; X86-NEXT: imull %edi ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl %edx, %eax ; X86-NEXT: shrl $31, %eax @@ -475,14 +468,12 @@ ; X86-NEXT: leal (,%edx,8), %eax ; 
X86-NEXT: subl %eax, %edx ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movd %edi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%ebp) +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%ebx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_srem7_v2i32: @@ -521,28 +512,29 @@ ; ; X86_WIDEN-LABEL: test_srem7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebp ; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl (%eax), %edi -; X86_WIDEN-NEXT: movl 4(%eax), %ecx -; X86_WIDEN-NEXT: movl $-1840700269, %ebp # imm = 0x92492493 -; X86_WIDEN-NEXT: movl %edi, %eax -; X86_WIDEN-NEXT: imull %ebp -; X86_WIDEN-NEXT: movl %edx, %esi -; X86_WIDEN-NEXT: addl %edi, %esi +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %esi +; X86_WIDEN-NEXT: movl $-1840700269, %edi # imm = 0x92492493 ; X86_WIDEN-NEXT: movl %esi, %eax +; X86_WIDEN-NEXT: imull %edi +; X86_WIDEN-NEXT: addl %esi, %edx +; X86_WIDEN-NEXT: movl %edx, %eax ; X86_WIDEN-NEXT: shrl $31, %eax -; X86_WIDEN-NEXT: sarl $2, %esi -; X86_WIDEN-NEXT: addl %eax, %esi -; X86_WIDEN-NEXT: leal (,%esi,8), %eax -; X86_WIDEN-NEXT: subl %eax, %esi -; X86_WIDEN-NEXT: addl %edi, %esi +; X86_WIDEN-NEXT: sarl $2, %edx +; X86_WIDEN-NEXT: addl %eax, %edx +; X86_WIDEN-NEXT: leal (,%edx,8), %eax +; X86_WIDEN-NEXT: subl %eax, %edx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86_WIDEN-NEXT: addl %esi, %edx +; X86_WIDEN-NEXT: movd %edx, %xmm0 ; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: imull %ebp +; X86_WIDEN-NEXT: imull %edi ; X86_WIDEN-NEXT: addl %ecx, %edx ; X86_WIDEN-NEXT: movl %edx, %eax ; X86_WIDEN-NEXT: shrl $31, %eax @@ -551,12 +543,12 @@ ; X86_WIDEN-NEXT: leal (,%edx,8), %eax ; X86_WIDEN-NEXT: subl %eax, %edx ; X86_WIDEN-NEXT: addl %ecx, %edx -; X86_WIDEN-NEXT: movl %edx, 4(%ebx) -; X86_WIDEN-NEXT: movl %esi, (%ebx) +; X86_WIDEN-NEXT: movd %edx, %xmm1 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%ebx) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi ; X86_WIDEN-NEXT: popl %ebx -; X86_WIDEN-NEXT: popl %ebp ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = srem <2 x i32> %a, @@ -568,10 +560,7 @@ ; X64-LABEL: test_udiv_pow2_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: psrlq $3, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: psrld $3, %xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq ; @@ -580,10 +569,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: psrlq $3, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: psrld $3, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; @@ -600,9 +586,7 @@ ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; 
X86_WIDEN-NEXT: psrld $3, %xmm0 -; X86_WIDEN-NEXT: movd %xmm0, (%eax) -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, 4(%eax) +; X86_WIDEN-NEXT: movq %xmm0, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, @@ -613,14 +597,9 @@ define void @test_urem_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X64-LABEL: test_urem_pow2_v2i32: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: movl 4(%rdi), %ecx -; X64-NEXT: movq %rcx, %xmm0 -; X64-NEXT: movq %rax, %xmm1 -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-NEXT: pand {{.*}}(%rip), %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: andps {{.*}}(%rip), %xmm0 +; X64-NEXT: movlps %xmm0, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_urem_pow2_v2i32: @@ -628,10 +607,8 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X86-NEXT: andps {{\.LCPI.*}}, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: movlps %xmm0, (%eax) ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_urem_pow2_v2i32: @@ -645,11 +622,9 @@ ; X86_WIDEN: # %bb.0: ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X86_WIDEN-NEXT: movd %xmm0, (%eax) -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, 4(%eax) +; X86_WIDEN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: andps {{\.LCPI.*}}, %xmm0 +; X86_WIDEN-NEXT: movlps %xmm0, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = urem <2 x i32> %a, @@ -661,24 +636,12 @@ ; X64-LABEL: test_sdiv_pow2_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: psrad $31, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: psrlq $31, %xmm0 -; X64-NEXT: pand {{.*}}(%rip), %xmm0 -; X64-NEXT: psrlq $29, %xmm0 -; X64-NEXT: paddq %xmm2, %xmm0 -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; X64-NEXT: psrad $31, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: psrlq $3, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: psrld $29, %xmm1 +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: psrad $3, %xmm1 +; X64-NEXT: movq %xmm1, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_sdiv_pow2_v2i32: @@ -686,38 +649,12 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-NEXT: psrad $31, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; X86-NEXT: movdqa %xmm0, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; 
X86-NEXT: movdqa {{.*#+}} xmm3 = [31,0,31,0] -; X86-NEXT: movdqa %xmm2, %xmm4 -; X86-NEXT: psrlq %xmm3, %xmm4 -; X86-NEXT: movl $31, %ecx -; X86-NEXT: movd %ecx, %xmm5 -; X86-NEXT: psrlq %xmm5, %xmm2 -; X86-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; X86-NEXT: movdqa %xmm1, %xmm4 -; X86-NEXT: psrlq %xmm3, %xmm4 -; X86-NEXT: psrlq %xmm5, %xmm1 -; X86-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; X86-NEXT: xorpd %xmm2, %xmm1 -; X86-NEXT: psubq %xmm2, %xmm1 -; X86-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X86-NEXT: psrlq $29, %xmm1 -; X86-NEXT: paddq %xmm0, %xmm1 -; X86-NEXT: psllq $32, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] ; X86-NEXT: psrad $31, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: psrlq $3, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: psrld $29, %xmm1 +; X86-NEXT: paddd %xmm0, %xmm1 +; X86-NEXT: psrad $3, %xmm1 +; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_sdiv_pow2_v2i32: @@ -741,9 +678,7 @@ ; X86_WIDEN-NEXT: psrld $29, %xmm1 ; X86_WIDEN-NEXT: paddd %xmm0, %xmm1 ; X86_WIDEN-NEXT: psrad $3, %xmm1 -; X86_WIDEN-NEXT: movd %xmm1, (%eax) -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, 4(%eax) +; X86_WIDEN-NEXT: movq %xmm1, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = sdiv <2 x i32> %a, @@ -755,10 +690,7 @@ ; X64-LABEL: test_srem_pow2_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: psrlq $3, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: psrld $3, %xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq ; @@ -767,10 +699,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: psrlq $3, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: psrld $3, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; @@ -787,9 +716,7 @@ ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86_WIDEN-NEXT: psrld $3, %xmm0 -; X86_WIDEN-NEXT: movd %xmm0, (%eax) -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, 4(%eax) +; X86_WIDEN-NEXT: movq %xmm0, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, @@ -803,52 +730,45 @@ ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1] -; X64-NEXT: movd %xmm2, %esi +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %esi ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %esi -; X64-NEXT: movl %eax, %esi +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm0, %esi ; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %edi +; X64-NEXT: divl %esi ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rcx) +; 
X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movq %xmm2, (%rcx) ; X64-NEXT: retq ; ; X86-LABEL: test_udiv_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] ; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: movd %xmm1, %ebx -; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1] ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: divl %ebx +; X86-NEXT: divl %esi ; X86-NEXT: movd %eax, %xmm0 -; X86-NEXT: movd %esi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%edi) +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86-NEXT: movq %xmm2, (%ecx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_udiv_v2i32: @@ -874,25 +794,27 @@ ; ; X86_WIDEN-LABEL: test_udiv_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebx -; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movl (%ecx), %eax -; X86_WIDEN-NEXT: movl 4(%ecx), %ecx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: movd %xmm1, %esi ; X86_WIDEN-NEXT: xorl %edx, %edx -; X86_WIDEN-NEXT: divl (%ebx) -; X86_WIDEN-NEXT: movl %eax, %esi +; X86_WIDEN-NEXT: divl %esi +; X86_WIDEN-NEXT: movd %eax, %xmm2 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: movd %xmm1, %esi ; X86_WIDEN-NEXT: xorl %edx, %edx -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: divl 4(%ebx) -; X86_WIDEN-NEXT: movl %eax, 4(%edi) -; X86_WIDEN-NEXT: movl %esi, (%edi) +; X86_WIDEN-NEXT: divl %esi +; X86_WIDEN-NEXT: movd %eax, %xmm0 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm2, (%ecx) ; X86_WIDEN-NEXT: popl %esi -; X86_WIDEN-NEXT: popl %edi -; X86_WIDEN-NEXT: popl %ebx ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = load <2 x i32>, <2 x i32>* %y @@ -907,52 +829,45 @@ ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1] -; X64-NEXT: movd %xmm2, %esi +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %esi ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %esi -; X64-NEXT: movl %edx, %esi +; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm0, %esi ; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %edi +; X64-NEXT: divl %esi ; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rcx) +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movq %xmm2, (%rcx) ; X64-NEXT: retq ; ; X86-LABEL: test_urem_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] ; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: movd %xmm1, %ebx -; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1] ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movd %edx, %xmm2 +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: divl %ebx +; X86-NEXT: divl %esi ; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movd %esi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%edi) +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86-NEXT: movq %xmm2, (%ecx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_urem_v2i32: @@ -978,25 +893,27 @@ ; ; X86_WIDEN-LABEL: test_urem_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebx -; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movl (%ecx), %eax -; X86_WIDEN-NEXT: movl 4(%ecx), %ecx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: movd %xmm1, %esi ; X86_WIDEN-NEXT: xorl %edx, %edx -; X86_WIDEN-NEXT: divl (%ebx) -; X86_WIDEN-NEXT: movl %edx, %esi +; X86_WIDEN-NEXT: divl %esi +; X86_WIDEN-NEXT: movd %edx, %xmm2 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: movd %xmm1, %esi ; X86_WIDEN-NEXT: xorl %edx, %edx -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: divl 4(%ebx) -; X86_WIDEN-NEXT: movl %edx, 4(%edi) -; X86_WIDEN-NEXT: movl %esi, (%edi) +; X86_WIDEN-NEXT: divl %esi +; X86_WIDEN-NEXT: movd %edx, %xmm0 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm2, (%ecx) ; X86_WIDEN-NEXT: popl %esi -; X86_WIDEN-NEXT: popl %edi -; X86_WIDEN-NEXT: popl %ebx ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = load <2 x i32>, <2 x i32>* %y @@ -1011,21 +928,20 @@ ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq {{.*#+}} xmm0 = 
mem[0],zero ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1] -; X64-NEXT: movd %xmm2, %esi +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %esi ; X64-NEXT: cltd ; X64-NEXT: idivl %esi -; X64-NEXT: movl %eax, %esi +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm0, %esi ; X64-NEXT: cltd -; X64-NEXT: idivl %edi +; X64-NEXT: idivl %esi ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rcx) +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movq %xmm2, (%rcx) ; X64-NEXT: retq ; ; X86-LABEL: test_sdiv_v2i32: @@ -1033,27 +949,26 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: movd %xmm1, %edi +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] ; X86-NEXT: movd %xmm1, %ebx -; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1] -; X86-NEXT: movd %xmm1, %esi -; X86-NEXT: cltd -; X86-NEXT: idivl %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd ; X86-NEXT: idivl %ebx ; X86-NEXT: movd %eax, %xmm0 -; X86-NEXT: movd %esi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%edi) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cltd +; X86-NEXT: idivl %edi +; X86-NEXT: movd %eax, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1085,19 +1000,26 @@ ; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movl (%ecx), %eax -; X86_WIDEN-NEXT: movl 4(%ecx), %ecx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: movd %xmm1, %edi +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: movd %xmm1, %ebx ; X86_WIDEN-NEXT: cltd -; X86_WIDEN-NEXT: idivl (%ebx) -; X86_WIDEN-NEXT: movl %eax, %esi +; X86_WIDEN-NEXT: idivl %ebx +; X86_WIDEN-NEXT: movd %eax, %xmm0 ; X86_WIDEN-NEXT: movl %ecx, %eax ; X86_WIDEN-NEXT: cltd -; X86_WIDEN-NEXT: idivl 4(%ebx) -; X86_WIDEN-NEXT: movl %eax, 4(%edi) -; X86_WIDEN-NEXT: movl %esi, (%edi) +; X86_WIDEN-NEXT: idivl %edi +; X86_WIDEN-NEXT: movd %eax, %xmm1 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%esi) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl 
%edi ; X86_WIDEN-NEXT: popl %ebx @@ -1115,21 +1037,20 @@ ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1] -; X64-NEXT: movd %xmm2, %esi +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %esi ; X64-NEXT: cltd ; X64-NEXT: idivl %esi -; X64-NEXT: movl %eax, %esi +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm0, %esi ; X64-NEXT: cltd -; X64-NEXT: idivl %edi +; X64-NEXT: idivl %esi ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rcx) +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movq %xmm2, (%rcx) ; X64-NEXT: retq ; ; X86-LABEL: test_srem_v2i32: @@ -1137,27 +1058,26 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: movd %xmm1, %edi +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] ; X86-NEXT: movd %xmm1, %ebx -; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1] -; X86-NEXT: movd %xmm1, %esi -; X86-NEXT: cltd -; X86-NEXT: idivl %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd ; X86-NEXT: idivl %ebx ; X86-NEXT: movd %eax, %xmm0 -; X86-NEXT: movd %esi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%edi) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cltd +; X86-NEXT: idivl %edi +; X86-NEXT: movd %eax, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1189,19 +1109,26 @@ ; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movl (%ecx), %eax -; X86_WIDEN-NEXT: movl 4(%ecx), %ecx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: movd %xmm1, %edi +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: movd %xmm1, %ebx ; X86_WIDEN-NEXT: cltd -; X86_WIDEN-NEXT: idivl (%ebx) -; X86_WIDEN-NEXT: movl %eax, %esi +; X86_WIDEN-NEXT: idivl %ebx +; X86_WIDEN-NEXT: movd %eax, %xmm0 ; X86_WIDEN-NEXT: movl %ecx, %eax ; X86_WIDEN-NEXT: cltd -; X86_WIDEN-NEXT: idivl 4(%ebx) -; X86_WIDEN-NEXT: movl %eax, 4(%edi) -; X86_WIDEN-NEXT: movl %esi, (%edi) +; X86_WIDEN-NEXT: idivl %edi +; X86_WIDEN-NEXT: movd %eax, %xmm1 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%esi) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi ; X86_WIDEN-NEXT: popl %ebx Index: test/CodeGen/X86/vector-sext.ll =================================================================== --- test/CodeGen/X86/vector-sext.ll +++ test/CodeGen/X86/vector-sext.ll @@ -5056,8 +5056,7 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_2i8_to_2i32: @@ -5066,27 +5065,35 @@ ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,u,u,u,1,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSSE3-NEXT: paddq %xmm0, %xmm0 +; SSSE3-NEXT: paddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_2i8_to_2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 -; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: movsbl 1(%rdi), %eax +; SSE41-NEXT: movsbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: sext_2i8_to_2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movsbl 1(%rdi), %eax +; AVX-NEXT: movsbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X32-SSE41-LABEL: sext_2i8_to_2i32: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 -; X32-SSE41-NEXT: paddq %xmm0, %xmm0 +; X32-SSE41-NEXT: movsbl 1(%eax), %ecx +; X32-SSE41-NEXT: movsbl (%eax), %eax +; X32-SSE41-NEXT: movd %eax, %xmm0 +; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; X32-SSE41-NEXT: paddd %xmm0, %xmm0 ; X32-SSE41-NEXT: retl %x = load <2 x i8>, <2 x i8>* %addr, align 1 %y = sext <2 x i8> %x to <2 x i32> Index: test/CodeGen/X86/vector-trunc.ll =================================================================== --- test/CodeGen/X86/vector-trunc.ll +++ test/CodeGen/X86/vector-trunc.ll @@ -1617,15 +1617,71 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: retq ; -; AVX-LABEL: trunc2x2i64_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX-NEXT: retq +; AVX1-LABEL: trunc2x2i64_4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: retq ; -; AVX512-LABEL: trunc2x2i64_4i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512-NEXT: retq +; AVX2-SLOW-LABEL: trunc2x2i64_4i32: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2x2i64_4i32: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, 
%ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc2x2i64_4i32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x2i64_4i32: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x2i64_4i32: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x2i64_4i32: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq entry: %0 = trunc <2 x i64> %a to <2 x i32> %1 = trunc <2 x i64> %b to <2 x i32> @@ -1640,34 +1696,59 @@ ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; -; AVX-LABEL: trunc2i64_i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: trunc2i64_i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc2i64_i64: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovq %xmm0, %rax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2i64_i64: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovq %xmm0, %rax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc2i64_i64: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc2i64_i64: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc2i64_i64: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; 
AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc2i64_i64: ; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp) -; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq entry: %0 = trunc <2 x i64> %inval to <2 x i32> Index: test/CodeGen/X86/vector-zext.ll =================================================================== --- test/CodeGen/X86/vector-zext.ll +++ test/CodeGen/X86/vector-zext.ll @@ -2267,28 +2267,35 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_2i8_to_2i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movzwl (%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[3],zero,zero,zero -; SSSE3-NEXT: paddq %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: paddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_2i8_to_2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: movzbl 1(%rdi), %eax +; SSE41-NEXT: movzbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: zext_2i8_to_2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movzbl 1(%rdi), %eax +; AVX-NEXT: movzbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %x = load <2 x i8>, <2 x i8>* %addr, align 1 %y = zext <2 x i8> %x to <2 x i32> Index: test/CodeGen/X86/vshift-4.ll =================================================================== --- test/CodeGen/X86/vshift-4.ll +++ test/CodeGen/X86/vshift-4.ll @@ -58,7 +58,7 @@ ; X32-LABEL: shift2a: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -67,7 +67,7 @@ ; ; X64-LABEL: shift2a: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 @@ -84,7 
+84,7 @@ ; X32-LABEL: shift2b: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -93,7 +93,7 @@ ; ; X64-LABEL: shift2b: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 @@ -110,7 +110,7 @@ ; X32-LABEL: shift2c: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -119,7 +119,7 @@ ; ; X64-LABEL: shift2c: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 Index: test/CodeGen/X86/widen_arith-3.ll =================================================================== --- test/CodeGen/X86/widen_arith-3.ll +++ test/CodeGen/X86/widen_arith-3.ll @@ -12,9 +12,8 @@ ; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-8, %esp -; CHECK-NEXT: subl $40, %esp +; CHECK-NEXT: subl $24, %esp ; CHECK-NEXT: movl {{\.LCPI.*}}, %eax -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 ; CHECK-NEXT: movw $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -26,13 +25,13 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl 12(%ebp), %edx ; CHECK-NEXT: movl 8(%ebp), %ecx -; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; CHECK-NEXT: pinsrd $2, 4(%edx,%eax,8), %xmm2 -; CHECK-NEXT: psubd %xmm0, %xmm2 -; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8) -; CHECK-NEXT: pshufb %xmm1, %xmm2 -; CHECK-NEXT: movd %xmm2, (%ecx,%eax,8) +; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; CHECK-NEXT: pinsrd $2, 4(%edx,%eax,8), %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: pextrw $4, %xmm1, 4(%ecx,%eax,8) +; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; CHECK-NEXT: movd %xmm1, (%ecx,%eax,8) ; CHECK-NEXT: incl {{[0-9]+}}(%esp) ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 Index: test/CodeGen/X86/widen_cast-5.ll =================================================================== --- test/CodeGen/X86/widen_cast-5.ll +++ test/CodeGen/X86/widen_cast-5.ll @@ -8,18 +8,15 @@ ; X86-LABEL: convert: ; X86: ## %bb.0: ## %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; X86-NEXT: pxor LCPI0_0, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: xorps LCPI0_0, %xmm0 +; X86-NEXT: movlps %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movq %rsi, %xmm0 -; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X64-NEXT: pxor {{.*}}(%rip), %xmm0 -; 
X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: test/CodeGen/X86/widen_conv-1.ll =================================================================== --- test/CodeGen/X86/widen_conv-1.ll +++ test/CodeGen/X86/widen_conv-1.ll @@ -8,16 +8,17 @@ ; X86-LABEL: convert_v2i64_to_v2i32: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert_v2i64_to_v2i32: ; X64: # %bb.0: # %entry -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: psubd %xmm1, %xmm0 ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: @@ -66,20 +67,14 @@ define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) nounwind { ; X86-LABEL: convert_v5i16_to_v5i8: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movdqa (%ecx), %xmm0 ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: psubw %xmm1, %xmm0 ; X86-NEXT: pextrb $8, %xmm0, 4(%eax) -; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] ; X86-NEXT: movd %xmm0, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: convert_v5i16_to_v5i8: @@ -88,7 +83,7 @@ ; X64-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-NEXT: psubw %xmm1, %xmm0 ; X64-NEXT: pextrb $8, %xmm0, 4(%rdi) -; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] ; X64-NEXT: movd %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: test/CodeGen/X86/widen_conv-2.ll =================================================================== --- test/CodeGen/X86/widen_conv-2.ll +++ test/CodeGen/X86/widen_conv-2.ll @@ -7,18 +7,18 @@ define void @convert_v2i16_v2i32(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind { ; X86-LABEL: convert_v2i16_v2i32: ; X86: # %bb.0: # %entry +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: psllq $48, %xmm0 +; X86-NEXT: pslld $16, %xmm0 ; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert_v2i16_v2i32: ; X64: # %bb.0: # %entry -; X64-NEXT: psllq $48, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pslld $16, %xmm0 ; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: test/CodeGen/X86/widen_conv-3.ll =================================================================== --- test/CodeGen/X86/widen_conv-3.ll +++ test/CodeGen/X86/widen_conv-3.ll @@ -9,19 +9,19 @@ define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind { ; X86-LABEL: convert_v2i16_to_v2f32: ; X86: # %bb.0: # %entry +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: psllq $48, %xmm0 +; X86-NEXT: pslld $16, %xmm0 ; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[1,3,2,3] ; X86-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-NEXT: movlps %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert_v2i16_to_v2f32: ; X64: # %bb.0: # %entry -; X64-NEXT: psllq $48, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pslld $16, %xmm0 ; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-NEXT: movlps %xmm0, (%rdi) ; X64-NEXT: retq Index: test/CodeGen/X86/widen_load-2.ll =================================================================== --- test/CodeGen/X86/widen_load-2.ll +++ test/CodeGen/X86/widen_load-2.ll @@ -151,7 +151,7 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl 16(%ebp), %ecx ; X86-NEXT: movl 12(%ebp), %edx @@ -163,8 +163,8 @@ ; X86-NEXT: pinsrd $2, 4(%ecx), %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 ; X86-NEXT: pextrw $4, %xmm1, 4(%eax) -; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X86-NEXT: movd %xmm1, (%eax) +; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X86-NEXT: movd %xmm0, (%eax) ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 @@ -176,8 +176,8 @@ ; X64-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-NEXT: paddd %xmm0, %xmm1 ; X64-NEXT: pextrw $4, %xmm1, 4(%rdi) -; X64-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X64-NEXT: movd %xmm1, (%rdi) +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X64-NEXT: movd %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i16vec3, %i16vec3* %ap, align 16 %b = load %i16vec3, %i16vec3* %bp, align 16 Index: test/CodeGen/X86/widened-broadcast.ll =================================================================== --- test/CodeGen/X86/widened-broadcast.ll +++ test/CodeGen/X86/widened-broadcast.ll @@ -597,22 +597,10 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_8i32_2i32_0101: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_8i32_2i32_0101: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_8i32_2i32_0101: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_8i32_2i32_0101: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -630,9 +618,7 @@ ; ; AVX1-LABEL: load_splat_16i32_2i32_0101: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 ; AVX1-NEXT: vmovaps %ymm0, %ymm1 ; AVX1-NEXT: retq ; @@ -644,9 +630,7 @@ ; ; AVX512-LABEL: load_splat_16i32_2i32_0101: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vbroadcastsd (%rdi), %zmm0 ; AVX512-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> Index: test/CodeGen/X86/x86-shifts.ll 
=================================================================== --- test/CodeGen/X86/x86-shifts.ll +++ test/CodeGen/X86/x86-shifts.ll @@ -254,16 +254,16 @@ ; X32-LABEL: shl2_other: ; X32: # %bb.0: # %entry ; X32-NEXT: movdqa %xmm0, %xmm1 -; X32-NEXT: psllq $2, %xmm1 -; X32-NEXT: psllq $9, %xmm0 +; X32-NEXT: pslld $2, %xmm1 +; X32-NEXT: pslld $9, %xmm0 ; X32-NEXT: pxor %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shl2_other: ; X64: # %bb.0: # %entry ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psllq $2, %xmm1 -; X64-NEXT: psllq $9, %xmm0 +; X64-NEXT: pslld $2, %xmm1 +; X64-NEXT: pslld $9, %xmm0 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: retq entry: @@ -276,19 +276,17 @@ define <2 x i32> @shr2_other(<2 x i32> %A) nounwind { ; X32-LABEL: shr2_other: ; X32: # %bb.0: # %entry -; X32-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-NEXT: movdqa %xmm0, %xmm1 -; X32-NEXT: psrlq $8, %xmm1 -; X32-NEXT: psrlq $1, %xmm0 +; X32-NEXT: psrld $8, %xmm1 +; X32-NEXT: psrld $1, %xmm0 ; X32-NEXT: pxor %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shr2_other: ; X64: # %bb.0: # %entry -; X64-NEXT: pand {{.*}}(%rip), %xmm0 ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psrlq $8, %xmm1 -; X64-NEXT: psrlq $1, %xmm0 +; X64-NEXT: psrld $8, %xmm1 +; X64-NEXT: psrld $1, %xmm0 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: retq entry: Index: test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -342,18 +342,30 @@ ; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 ; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 ; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 -; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0 -; ZEROTHRESH-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0 -; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] -; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] -; ZEROTHRESH-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]] -; ZEROTHRESH-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]] -; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0 -; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 -; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[S2]], i32 2 -; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 +; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 +; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer +; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer +; ZEROTHRESH-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 +; ZEROTHRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 +; ZEROTHRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x float> 
[[TMP9]], float [[B1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] +; ZEROTHRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] +; ZEROTHRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 +; ZEROTHRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 +; ZEROTHRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 +; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 +; ZEROTHRESH-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 +; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 ; ZEROTHRESH-NEXT: ret <4 x float> [[RD]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -430,18 +442,12 @@ ; CHECK-NEXT: ret <2 x float> [[RB]] ; ; ZEROTHRESH-LABEL: @simple_select_v2( -; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> [[C]], i32 1 -; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> [[A]], i32 1 -; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> [[B:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> [[B]], i32 1 -; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 -; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] -; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] -; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[S0]], i32 0 -; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[S1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer +; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]] +; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0 +; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1 ; ZEROTHRESH-NEXT: ret <2 x float> [[RB]] ; %c0 = extractelement <2 x i32> %c, i32 0