Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -747,6 +747,7 @@
   SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue WidenVecOp_STORE(SDNode* N);
   SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
+  SDValue WidenVecOp_MGATHER(SDNode* N, unsigned OpNo);
   SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo);
   SDValue WidenVecOp_SETCC(SDNode* N);
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3351,6 +3351,7 @@
   case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::STORE: Res = WidenVecOp_STORE(N); break;
   case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
+  case ISD::MGATHER: Res = WidenVecOp_MGATHER(N, OpNo); break;
   case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break;
   case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
   case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break;
@@ -3602,36 +3603,85 @@
                             false, MST->isCompressingStore());
 }
 
-SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
-  assert(OpNo == 1 && "Can widen only data operand of mscatter");
-  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
-  SDValue DataOp = MSC->getValue();
-  SDValue Mask = MSC->getMask();
+SDValue DAGTypeLegalizer::WidenVecOp_MGATHER(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 4 && "Can widen only the index of mgather");
+  auto *MG = cast<MaskedGatherSDNode>(N);
+  SDValue DataOp = MG->getValue();
+  SDValue Mask = MG->getMask();
+  SDValue Scale = MG->getScale();
   EVT MaskVT = Mask.getValueType();
-  SDValue Scale = MSC->getScale();
+  EVT DataVT = DataOp.getValueType();
+
+  // Widen index.
+  SDValue Index = GetWidenedVector(MG->getIndex());
+  unsigned NumElts = Index.getValueType().getVectorNumElements();
 
   // Widen the value.
-  SDValue WideVal = GetWidenedVector(DataOp);
-  EVT WideVT = WideVal.getValueType();
-  unsigned NumElts = WideVT.getVectorNumElements();
-  SDLoc dl(N);
+  EVT WideDataVT = EVT::getVectorVT(*DAG.getContext(),
+                                    DataVT.getVectorElementType(),
+                                    NumElts);
+  DataOp = ModifyToType(DataOp, WideDataVT);
 
   // The mask should be widened as well.
   EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
                                     MaskVT.getVectorElementType(), NumElts);
   Mask = ModifyToType(Mask, WideMaskVT, true);
 
-  // Widen index.
+  SDLoc dl(N);
+  SDValue Ops[] = {MG->getChain(), DataOp, Mask, MG->getBasePtr(), Index,
+                   Scale};
+  SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideDataVT, MVT::Other),
+                                    MG->getMemoryVT(), dl, Ops,
+                                    MG->getMemOperand());
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+
+  Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, N->getValueType(0), Res,
+                    DAG.getConstant(0, dl,
+                                    TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+  ReplaceValueWith(SDValue(N, 0), Res.getValue(0));
+  return SDValue();
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
+  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
+  SDValue DataOp = MSC->getValue();
+  SDValue Mask = MSC->getMask();
   SDValue Index = MSC->getIndex();
-  EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
-                                     Index.getValueType().getScalarType(),
-                                     NumElts);
-  Index = ModifyToType(Index, WideIndexVT);
+  SDValue Scale = MSC->getScale();
+
+  unsigned NumElts;
+  if (OpNo == 1) {
+    DataOp = GetWidenedVector(DataOp);
+    NumElts = DataOp.getValueType().getVectorNumElements();
+
+    // Widen index.
+    EVT IndexVT = Index.getValueType();
+    EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
+                                       IndexVT.getVectorElementType(), NumElts);
+    Index = ModifyToType(Index, WideIndexVT);
+  } else if (OpNo == 4) {
+    Index = GetWidenedVector(Index);
+    NumElts = Index.getValueType().getVectorNumElements();
+
+    // Widen the data.
+    EVT DataVT = DataOp.getValueType();
+    EVT WideDataVT = EVT::getVectorVT(*DAG.getContext(),
+                                      DataVT.getVectorElementType(), NumElts);
+    DataOp = ModifyToType(DataOp, WideDataVT);
+  } else
+    llvm_unreachable("Can't widen this operand of mscatter");
 
-  SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index,
+  // The mask should be widened as well.
+  EVT MaskVT = Mask.getValueType();
+  EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
+                                    MaskVT.getVectorElementType(), NumElts);
+
+  Mask = ModifyToType(Mask, WideMaskVT, true);
+  SDValue Ops[] = {MSC->getChain(), DataOp, Mask, MSC->getBasePtr(), Index,
                    Scale};
   return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
-                              MSC->getMemoryVT(), dl, Ops,
+                              MSC->getMemoryVT(), SDLoc(N), Ops,
                               MSC->getMemOperand());
 }
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -801,13 +801,6 @@
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 
-    // Provide custom widening for v2f32 setcc. This is really for VLX when
-    // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
-    // type legalization changing the result type to v4i1 during widening.
-    // It works fine for SSE2 and is probably faster so no need to qualify with
-    // VLX support.
-    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
-
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::CTPOP, VT, Custom);
@@ -1732,6 +1725,9 @@
 
 TargetLoweringBase::LegalizeTypeAction
 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (VT == MVT::v2i32 && Subtarget.hasSSE2())
+    return TypeWidenVector;
+
   if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
     return TypeSplitVector;
 
@@ -17938,11 +17934,6 @@
   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
          "Invalid number of packed elements for source and destination!");
 
-  // This is being called by type legalization because v2i32 is marked custom
-  // for result type legalization for v2f32.
-  if (VTOp0 == MVT::v2i32)
-    return SDValue();
-
   if (VT.is128BitVector() && VTOp0.is256BitVector()) {
     // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
     // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
@@ -24773,26 +24764,6 @@
     Results.push_back(Res);
     return;
   }
-  case ISD::SETCC: {
-    // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
-    // setCC result type is v2i1 because type legalzation will end up with
-    // a v4i1 setcc plus an extend.
-    assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
-    if (N->getOperand(0).getValueType() != MVT::v2f32)
-      return;
-    SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
-    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
-                              N->getOperand(0), UNDEF);
-    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
-                              N->getOperand(1), UNDEF);
-    SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
-                              N->getOperand(2));
-    if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
-      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
-                        DAG.getIntPtrConstant(0, dl));
-    Results.push_back(Res);
-    return;
-  }
   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
   case X86ISD::FMINC:
   case X86ISD::FMIN:
@@ -24839,22 +24810,16 @@
                         Src, DAG.getIntPtrConstant(0, dl));
       }
       SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
-      bool WidenType = getTypeAction(*DAG.getContext(),
-                                     MVT::v2i32) == TypeWidenVector;
-      ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
-      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
+      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Res,
                         DAG.getIntPtrConstant(0, dl));
       Results.push_back(Res);
       return;
     }
     if (Src.getValueType() == MVT::v2f32) {
-      SDValue Idx = DAG.getIntPtrConstant(0, dl);
       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                 DAG.getUNDEF(MVT::v2f32));
       Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
                         MVT::v4i32, Res);
-      if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
-        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
       Results.push_back(Res);
       return;
     }
@@ -25125,56 +25090,30 @@
       Results.push_back(Res.getValue(2));
       return;
     }
-    if (VT == MVT::v2i32) {
+    if (VT == MVT::v2i32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
       auto *Gather = cast<MaskedGatherSDNode>(N);
       SDValue Index = Gather->getIndex();
+      if (Index.getValueType() != MVT::v2i64)
+        return;
       SDValue Mask = Gather->getMask();
       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
       SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
                                  Gather->getValue(), DAG.getUNDEF(MVT::v2i32));
-      // If the index is v2i64 we can use it directly.
- if (Index.getValueType() == MVT::v2i64 && - (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { - if (!Subtarget.hasVLX()) { - // We need to widen the mask, but the instruction will only use 2 - // of its elements. So we can use undef. - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getUNDEF(MVT::v2i1)); - Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); - } - SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), - Index, Gather->getScale() }; - SDValue Res = DAG.getTargetMemSDNode( - DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, - Gather->getMemoryVT(), Gather->getMemOperand()); - SDValue Chain = Res.getValue(2); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - Results.push_back(Chain); - return; + if (!Subtarget.hasVLX()) { + // We need to widen the mask, but the instruction will only use 2 + // of its elements. So we can use undef. + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getUNDEF(MVT::v2i1)); + Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } - EVT IndexVT = Index.getValueType(); - EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), - IndexVT.getScalarType(), 4); - // Otherwise we need to custom widen everything to avoid promotion. - Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, - DAG.getUNDEF(IndexVT)); - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getConstant(0, dl, MVT::v2i1)); SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), - Gather->getMemoryVT(), dl, Ops, - Gather->getMemOperand()); - SDValue Chain = Res.getValue(1); - if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); + SDValue Res = DAG.getTargetMemSDNode( + DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, + Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); - Results.push_back(Chain); + Results.push_back(Res.getValue(2)); return; } break; Index: test/Analysis/CostModel/X86/alternate-shuffle-cost.ll =================================================================== --- test/Analysis/CostModel/X86/alternate-shuffle-cost.ll +++ test/Analysis/CostModel/X86/alternate-shuffle-cost.ll @@ -8,7 +8,7 @@ ; Verify the cost model for alternate shuffles. ; shufflevector instructions with illegal 64-bit vector types. -; 64-bit packed integer vectors (v2i32) are promoted to type v2i64. +; 64-bit packed integer vectors (v2i32) are widened to type v4i32. ; 64-bit packed float vectors (v2f32) are widened to type v4f32. 
define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) { @@ -16,8 +16,8 @@ ret <2 x i32> %1 } ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i32': -; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector -; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector ; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector ; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector ; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector @@ -38,8 +38,8 @@ ret <2 x i32> %1 } ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i32_2': -; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector -; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector ; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector ; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector ; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector Index: test/Analysis/CostModel/X86/arith.ll =================================================================== --- test/Analysis/CostModel/X86/arith.ll +++ test/Analysis/CostModel/X86/arith.ll @@ -568,12 +568,12 @@ ; A <2 x i32> gets expanded to a <2 x i64> vector. ; A <2 x i64> vector multiply is implemented using ; 3 PMULUDQ and 2 PADDS and 4 shifts. - ; SSSE3: cost of 8 {{.*}} %A0 = mul - ; SSE42: cost of 8 {{.*}} %A0 = mul - ; AVX: cost of 8 {{.*}} %A0 = mul - ; AVX2: cost of 8 {{.*}} %A0 = mul - ; AVX512F: cost of 8 {{.*}} %A0 = mul - ; AVX512BW: cost of 8 {{.*}} %A0 = mul + ; SSSE3: cost of 6 {{.*}} %A0 = mul + ; SSE42: cost of 2 {{.*}} %A0 = mul + ; AVX: cost of 2 {{.*}} %A0 = mul + ; AVX2: cost of 2 {{.*}} %A0 = mul + ; AVX512F: cost of 1 {{.*}} %A0 = mul + ; AVX512BW: cost of 1 {{.*}} %A0 = mul ; AVX512DQ: cost of 1 {{.*}} %A0 = mul %A0 = mul <2 x i32> undef, undef Index: test/Analysis/CostModel/X86/fptoui.ll =================================================================== --- test/Analysis/CostModel/X86/fptoui.ll +++ test/Analysis/CostModel/X86/fptoui.ll @@ -50,7 +50,7 @@ ; SSE42: cost of 6 {{.*}} %V2I32 = fptoui ; AVX1: cost of 6 {{.*}} %V2I32 = fptoui ; AVX2: cost of 6 {{.*}} %V2I32 = fptoui - ; AVX512F: cost of 6 {{.*}} %V2I32 = fptoui + ; AVX512F: cost of 1 {{.*}} %V2I32 = fptoui ; AVX512DQ: cost of 1 {{.*}} %V2I32 = fptoui %V2I32 = fptoui <2 x double> undef to <2 x i32> ; SSE2: cost of 13 {{.*}} %V4I32 = fptoui Index: test/Analysis/CostModel/X86/masked-intrinsic-cost.ll =================================================================== --- test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -45,7 +45,7 @@ } ; AVX2-LABEL: test6 -; AVX2: Found an estimated cost of 6 {{.*}}.masked +; AVX2: Found an estimated cost of 5 {{.*}}.masked define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) @@ -61,7 +61,7 @@ } ; AVX2-LABEL: test8 -; AVX2: Found an estimated cost of 6 {{.*}}.masked +; AVX2: Found an estimated cost of 5 {{.*}}.masked define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res 
= call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) Index: test/Analysis/CostModel/X86/sitofp.ll =================================================================== --- test/Analysis/CostModel/X86/sitofp.ll +++ test/Analysis/CostModel/X86/sitofp.ll @@ -70,7 +70,7 @@ ; AVX512: cost of 1 {{.*}} sitofp i32 %cvt_i32_f64 = sitofp i32 undef to double - ; SSE2: cost of 20 {{.*}} sitofp <2 x i32> + ; SSE2: cost of 40 {{.*}} sitofp <2 x i32> ; AVX1: cost of 4 {{.*}} sitofp <2 x i32> ; AVX2: cost of 4 {{.*}} sitofp <2 x i32> ; AVX512: cost of 4 {{.*}} sitofp <2 x i32> Index: test/Analysis/CostModel/X86/slm-arith-costs.ll =================================================================== --- test/Analysis/CostModel/X86/slm-arith-costs.ll +++ test/Analysis/CostModel/X86/slm-arith-costs.ll @@ -195,7 +195,7 @@ define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) { entry: -; SLM: cost of 17 {{.*}} mul nsw <2 x i32> +; SLM: cost of 11 {{.*}} mul nsw <2 x i32> %res = mul nsw <2 x i32> %a, %b ret <2 x i32> %res } Index: test/Analysis/CostModel/X86/testshiftashr.ll =================================================================== --- test/Analysis/CostModel/X86/testshiftashr.ll +++ test/Analysis/CostModel/X86/testshiftashr.ll @@ -65,7 +65,7 @@ define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { entry: ; SSE2: shift2i32 - ; SSE2: cost of 12 {{.*}} ashr + ; SSE2: cost of 16 {{.*}} ashr ; SSE2-CODEGEN: shift2i32 ; SSE2-CODEGEN: psrlq @@ -320,7 +320,7 @@ define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) { entry: ; SSE2: shift2i32c - ; SSE2: cost of 4 {{.*}} ashr + ; SSE2: cost of 1 {{.*}} ashr ; SSE2-CODEGEN: shift2i32c ; SSE2-CODEGEN: psrad $3 Index: test/Analysis/CostModel/X86/testshiftlshr.ll =================================================================== --- test/Analysis/CostModel/X86/testshiftlshr.ll +++ test/Analysis/CostModel/X86/testshiftlshr.ll @@ -65,9 +65,9 @@ define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { entry: ; SSE2: shift2i32 - ; SSE2: cost of 4 {{.*}} lshr + ; SSE2: cost of 16 {{.*}} lshr ; SSE2-CODEGEN: shift2i32 - ; SSE2-CODEGEN: psrlq + ; SSE2-CODEGEN: psrld %0 = lshr %shifttype2i32 %a , %b ret %shifttype2i32 %0 @@ -322,7 +322,7 @@ ; SSE2: shift2i32c ; SSE2: cost of 1 {{.*}} lshr ; SSE2-CODEGEN: shift2i32c - ; SSE2-CODEGEN: psrlq $3 + ; SSE2-CODEGEN: psrld $3 %0 = lshr %shifttypec2i32 %a , ret %shifttypec2i32 %0 Index: test/Analysis/CostModel/X86/testshiftshl.ll =================================================================== --- test/Analysis/CostModel/X86/testshiftshl.ll +++ test/Analysis/CostModel/X86/testshiftshl.ll @@ -65,9 +65,9 @@ define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { entry: ; SSE2: shift2i32 - ; SSE2: cost of 4 {{.*}} shl + ; SSE2: cost of 10 {{.*}} shl ; SSE2-CODEGEN: shift2i32 - ; SSE2-CODEGEN: psllq + ; SSE2-CODEGEN: pmuludq %0 = shl %shifttype2i32 %a , %b ret %shifttype2i32 %0 @@ -322,7 +322,7 @@ ; SSE2: shift2i32c ; SSE2: cost of 1 {{.*}} shl ; SSE2-CODEGEN: shift2i32c - ; SSE2-CODEGEN: psllq $3 + ; SSE2-CODEGEN: pslld $3 %0 = shl %shifttypec2i32 %a , ret %shifttypec2i32 %0 Index: test/Analysis/CostModel/X86/uitofp.ll =================================================================== --- test/Analysis/CostModel/X86/uitofp.ll +++ test/Analysis/CostModel/X86/uitofp.ll @@ -70,7 +70,7 @@ ; AVX512: cost of 1 {{.*}} uitofp i32 %cvt_i32_f64 = uitofp i32 undef to double - ; SSE2: cost of 20 {{.*}} uitofp <2 
x i32> + ; SSE2: cost of 40 {{.*}} uitofp <2 x i32> ; AVX1: cost of 6 {{.*}} uitofp <2 x i32> ; AVX2: cost of 6 {{.*}} uitofp <2 x i32> ; AVX512: cost of 1 {{.*}} uitofp <2 x i32> Index: test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll =================================================================== --- test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll +++ test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll @@ -7,7 +7,6 @@ define <2 x double> @a(<2 x i32> %x) nounwind { ; CHECK-LABEL: a: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0 ; CHECK-NEXT: retl entry: @@ -19,7 +18,6 @@ ; CHECK-LABEL: b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttpd2dq %xmm0, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: retl entry: %y = fptosi <2 x double> %x to <2 x i32> Index: test/CodeGen/X86/2012-01-18-vbitcast.ll =================================================================== --- test/CodeGen/X86/2012-01-18-vbitcast.ll +++ test/CodeGen/X86/2012-01-18-vbitcast.ll @@ -4,17 +4,9 @@ define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: vcast: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $16, %rsp -; CHECK-NEXT: .seh_stackalloc 16 -; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; CHECK-NEXT: psubq %xmm1, %xmm0 -; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: movdqa (%rcx), %xmm0 +; CHECK-NEXT: psubd (%rdx), %xmm0 ; CHECK-NEXT: retq -; CHECK-NEXT: .seh_handlerdata -; CHECK-NEXT: .text -; CHECK-NEXT: .seh_endproc %af = bitcast <2 x float> %a to <2 x i32> %bf = bitcast <2 x float> %b to <2 x i32> %x = sub <2 x i32> %af, %bf Index: test/CodeGen/X86/2012-07-10-extload64.ll =================================================================== --- test/CodeGen/X86/2012-07-10-extload64.ll +++ test/CodeGen/X86/2012-07-10-extload64.ll @@ -22,8 +22,8 @@ ; CHECK-LABEL: store_64: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movlps %xmm0, (%eax) +; CHECK-NEXT: movl $0, 4(%eax) +; CHECK-NEXT: movl $0, (%eax) ; CHECK-NEXT: retl BB: store <2 x i32> zeroinitializer, <2 x i32>* %ptr @@ -34,7 +34,7 @@ ; CHECK-LABEL: load_64: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retl BB: %t = load <2 x i32>, <2 x i32>* %ptr Index: test/CodeGen/X86/avx2-masked-gather.ll =================================================================== --- test/CodeGen/X86/avx2-masked-gather.ll +++ test/CodeGen/X86/avx2-masked-gather.ll @@ -9,23 +9,21 @@ define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { ; X86-LABEL: masked_gather_v2i32: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1 -; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2i32: ; X64: # %bb.0: # %entry ; X64-NEXT: vmovdqa (%rdi), %xmm2 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 -; X64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; NOGATHER-LABEL: masked_gather_v2i32: @@ -44,11 +42,11 @@ ; NOGATHER-NEXT: je .LBB0_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2 +; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 ; NOGATHER-NEXT: .LBB0_4: # %else2 -; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0 -; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0 +; NOGATHER-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; NOGATHER-NEXT: retq entry: %ld = load <2 x i32*>, <2 x i32*>* %ptr @@ -59,11 +57,10 @@ define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { ; X86-LABEL: masked_gather_v2i32_concat: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl @@ -71,7 +68,6 @@ ; X64-LABEL: masked_gather_v2i32_concat: ; X64: # %bb.0: # %entry ; X64-NEXT: vmovdqa (%rdi), %xmm2 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 @@ -94,12 +90,11 @@ ; NOGATHER-NEXT: je .LBB1_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2 +; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 ; NOGATHER-NEXT: .LBB1_4: # %else2 -; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0 -; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 -; NOGATHER-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0 +; NOGATHER-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; NOGATHER-NEXT: retq entry: %ld = load <2 x i32*>, <2 x i32*>* %ptr @@ -716,11 +711,17 @@ define <2 x i64> @masked_gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i64> %passthro) { ; X86-LABEL: masked_gather_v2i64: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovsxdq (%eax), %xmm2 +; X86-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrd $1, 4(%eax), %xmm2, %xmm2 +; X86-NEXT: vmovdqa %xmm0, %xmm0 +; X86-NEXT: vpgatherdq %ymm0, (,%xmm2), %ymm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 +; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2i64: @@ -763,11 +764,17 @@ define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks, <2 x double> %passthro) { ; X86-LABEL: masked_gather_v2double: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovsxdq 
(%eax), %xmm2 +; X86-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrd $1, 4(%eax), %xmm2, %xmm2 +; X86-NEXT: vmovdqa %xmm0, %xmm0 +; X86-NEXT: vgatherdpd %ymm0, (,%xmm2), %ymm1 ; X86-NEXT: vmovapd %xmm1, %xmm0 +; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2double: Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -1727,23 +1727,25 @@ ; NOVL: # %bb.0: ; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 ; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto2f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 ; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto2f64: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; VLNODQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; VLNODQ-NEXT: retq %cmpres = fcmp ogt <2 x double> %a, zeroinitializer @@ -2012,29 +2014,42 @@ } define <2 x float> @ubto2f32(<2 x i32> %a) { -; ALL-LABEL: ubto2f32: -; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: retq +; NOVL-LABEL: ubto2f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 } define <2 x double> @ubto2f64(<2 x i32> %a) { -; ALL-LABEL: ubto2f64: -; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; ALL-NEXT: retq +; NOVL-LABEL: ubto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f64: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd 
{{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 Index: test/CodeGen/X86/avx512-schedule.ll =================================================================== --- test/CodeGen/X86/avx512-schedule.ll +++ test/CodeGen/X86/avx512-schedule.ll @@ -2555,16 +2555,16 @@ ; GENERIC-LABEL: sbto2f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] -; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33] -; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <2 x double> %a, zeroinitializer @@ -2910,19 +2910,15 @@ ; GENERIC-LABEL: ubto2f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] -; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:0.50] +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f32: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [7:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> @@ -2933,20 +2929,16 @@ ; GENERIC-LABEL: ubto2f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] -; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:0.50] +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [5:1.00] ; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; 
SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [7:0.50] ; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <2 x i32> %a, zeroinitializer @@ -6745,9 +6737,9 @@ define void @mask16_mem(i16* %ptr) { ; GENERIC-LABEL: mask16_mem: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovw (%rdi), %k0 +; GENERIC-NEXT: kmovw (%rdi), %k0 # sched: [4:0.50] ; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovw %k0, (%rdi) +; GENERIC-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mask16_mem: @@ -6767,9 +6759,9 @@ define void @mask8_mem(i8* %ptr) { ; GENERIC-LABEL: mask8_mem: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovb (%rdi), %k0 +; GENERIC-NEXT: kmovb (%rdi), %k0 # sched: [4:0.50] ; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mask8_mem: @@ -6816,8 +6808,8 @@ define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) { ; GENERIC-LABEL: mand16_mem: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovw (%rdi), %k0 -; GENERIC-NEXT: kmovw (%rsi), %k1 +; GENERIC-NEXT: kmovw (%rdi), %k0 # sched: [4:0.50] +; GENERIC-NEXT: kmovw (%rsi), %k1 # sched: [4:0.50] ; GENERIC-NEXT: kandw %k1, %k0, %k2 # sched: [1:1.00] ; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: korw %k0, %k2, %k0 # sched: [1:1.00] @@ -6946,7 +6938,7 @@ ; GENERIC-LABEL: conv1: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kxnorw %k0, %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: movb $-2, -{{[0-9]+}}(%rsp) # sched: [5:1.00] ; GENERIC-NEXT: movb $-2, %al # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7362,7 +7354,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test22: @@ -7380,7 +7372,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test23: @@ -7399,7 +7391,7 @@ ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kxnorw %k0, %k0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovb %k0, (%rsi) +; GENERIC-NEXT: kmovb %k0, (%rsi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_v1i1: @@ -7420,7 +7412,7 @@ ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_v2i1: @@ -7441,7 +7433,7 @@ ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_v4i1: @@ -7462,7 +7454,7 @@ ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_v8i1: @@ -7483,7 +7475,7 @@ ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovw %k0, (%rdi) +; GENERIC-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_v16i1: @@ -7746,7 +7738,7 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) { ; GENERIC-LABEL: load_8i1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovb (%rdi), %k0 +; GENERIC-NEXT: kmovb (%rdi), %k0 # sched: [4:0.50] ; GENERIC-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7763,7 +7755,7 @@ define <16 x i32> @load_16i1(<16 x i1>* %a) { ; GENERIC-LABEL: load_16i1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovw (%rdi), %k0 +; GENERIC-NEXT: kmovw (%rdi), %k0 # sched: [4:0.50] ; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7780,7 +7772,7 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) { ; GENERIC-LABEL: load_2i1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovb (%rdi), %k0 +; GENERIC-NEXT: kmovb (%rdi), %k0 # sched: [4:0.50] ; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7797,7 +7789,7 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) { ; GENERIC-LABEL: load_4i1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovb (%rdi), %k0 +; GENERIC-NEXT: kmovb (%rdi), %k0 # sched: [4:0.50] ; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7814,7 +7806,7 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) { ; GENERIC-LABEL: load_32i1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd (%rdi), %k0 +; GENERIC-NEXT: kmovd (%rdi), %k0 # sched: [4:0.50] ; GENERIC-NEXT: vpmovm2w %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7831,7 +7823,7 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) { ; GENERIC-LABEL: load_64i1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovq (%rdi), %k0 +; GENERIC-NEXT: kmovq (%rdi), %k0 # sched: [4:0.50] ; GENERIC-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7850,7 +7842,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_8i1: @@ -7868,7 +7860,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_8i1_1: @@ -7887,7 +7879,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: kmovw %k0, (%rdi) +; GENERIC-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_16i1: @@ -7905,7 +7897,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00] ; 
GENERIC-NEXT: vpmovb2m %ymm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: kmovd %k0, (%rdi) +; GENERIC-NEXT: kmovd %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7925,7 +7917,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovw2m %zmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: kmovd %k0, (%rdi) +; GENERIC-NEXT: kmovd %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7948,7 +7940,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: kmovq %k0, (%rdi) +; GENERIC-NEXT: kmovq %k0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; Index: test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll =================================================================== --- test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll +++ test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll @@ -536,9 +536,7 @@ define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) { ; CHECK-LABEL: test_2xi32_to_16xi32_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -547,10 +545,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -562,10 +558,8 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -576,10 +570,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> 
undef, <16 x i32> @@ -591,10 +583,8 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -605,10 +595,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -620,10 +608,8 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -634,10 +620,8 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -649,10 +633,8 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> Index: test/CodeGen/X86/avx512-trunc.ll =================================================================== --- test/CodeGen/X86/avx512-trunc.ll +++ test/CodeGen/X86/avx512-trunc.ll @@ -258,9 +258,20 @@ } define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { -; ALL-LABEL: trunc_qd_128: -; ALL: ## %bb.0: -; ALL-NEXT: 
retq +; KNL-LABEL: trunc_qd_128: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qd_128: +; SKX: ## %bb.0: +; SKX-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %x = trunc <2 x i64> %i to <2 x i32> ret <2 x i32> %x } @@ -268,8 +279,10 @@ define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 { ; KNL-LABEL: trunc_qd_128_mem: ; KNL: ## %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL-NEXT: vmovlps %xmm0, (%rdi) +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qd_128_mem: Index: test/CodeGen/X86/bitcast-and-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-and-setcc-128.ll +++ test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -491,130 +491,44 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; SSE2-SSSE3-LABEL: v2i32: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $32, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: psrad $31, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0] -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: psrad $31, %xmm1 +; 
SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movmskpd %xmm2, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX12-LABEL: v2i32: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 +; AVX12-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512F-NEXT: vpsraq $32, %xmm3, %xmm3 -; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512F-NEXT: vpsraq $32, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: ; AVX512BW: # %bb.0: -; 
AVX512BW-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsraq $32, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsraq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq Index: test/CodeGen/X86/bitcast-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-setcc-128.ll +++ test/CodeGen/X86/bitcast-setcc-128.ll @@ -333,79 +333,32 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; SSE2-SSSE3-LABEL: v2i32: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $32, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: psrad $31, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0] -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX12-LABEL: v2i32: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtd 
%xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq Index: test/CodeGen/X86/compress_expand.ll =================================================================== --- test/CodeGen/X86/compress_expand.ll +++ test/CodeGen/X86/compress_expand.ll @@ -254,18 +254,17 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger) { ; SKX-LABEL: test13: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k1 ; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1} ; SKX-NEXT: retq ; ; KNL-LABEL: test13: ; KNL: # %bb.0: +; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 +; KNL-NEXT: vptestnmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k1 ; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1} @@ -279,18 +278,17 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) { ; SKX-LABEL: test14: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k1 ; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1} ; SKX-NEXT: retq ; ; KNL-LABEL: test14: ; KNL: # %bb.0: +; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 +; KNL-NEXT: vptestnmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k1 ; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1} Index: test/CodeGen/X86/cvtv2f32.ll =================================================================== --- test/CodeGen/X86/cvtv2f32.ll +++ test/CodeGen/X86/cvtv2f32.ll @@ -41,11 +41,9 @@ define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) { ; X32-LABEL: uitofp_2i32_buildvector_cvt: ; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X32-NEXT: movapd {{.*#+}} xmm1 = 
[4.503600e+15,4.503600e+15] -; X32-NEXT: orpd %xmm1, %xmm2 +; X32-NEXT: movdqa {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15] +; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X32-NEXT: por %xmm1, %xmm2 ; X32-NEXT: subpd %xmm1, %xmm2 ; X32-NEXT: cvtpd2ps %xmm2, %xmm1 ; X32-NEXT: mulps %xmm1, %xmm0 @@ -53,13 +51,13 @@ ; ; X64-LABEL: uitofp_2i32_buildvector_cvt: ; X64: # %bb.0: -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: movd %edi, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X64-NEXT: movdqa {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15] -; X64-NEXT: por %xmm1, %xmm2 -; X64-NEXT: subpd %xmm1, %xmm2 -; X64-NEXT: cvtpd2ps %xmm2, %xmm1 +; X64-NEXT: movd %edi, %xmm1 +; X64-NEXT: pinsrd $1, %esi, %xmm1 +; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503600e+15,4.503600e+15] +; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: subpd %xmm2, %xmm1 +; X64-NEXT: cvtpd2ps %xmm1, %xmm1 ; X64-NEXT: mulps %xmm1, %xmm0 ; X64-NEXT: retq %t1 = insertelement <2 x i32> undef, i32 %x, i32 0 @@ -72,23 +70,21 @@ define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) { ; X32-LABEL: uitofp_2i32_legalized: ; X32: # %bb.0: -; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; X32-NEXT: movaps {{.*#+}} xmm0 = [4.503600e+15,4.503600e+15] -; X32-NEXT: orps %xmm0, %xmm2 -; X32-NEXT: subpd %xmm0, %xmm2 -; X32-NEXT: cvtpd2ps %xmm2, %xmm0 +; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X32-NEXT: movdqa {{.*#+}} xmm2 = [4.503600e+15,4.503600e+15] +; X32-NEXT: por %xmm2, %xmm0 +; X32-NEXT: subpd %xmm2, %xmm0 +; X32-NEXT: cvtpd2ps %xmm0, %xmm0 ; X32-NEXT: mulps %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: uitofp_2i32_legalized: ; X64: # %bb.0: -; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; X64-NEXT: movaps {{.*#+}} xmm0 = [4.503600e+15,4.503600e+15] -; X64-NEXT: orps %xmm0, %xmm2 -; X64-NEXT: subpd %xmm0, %xmm2 -; X64-NEXT: cvtpd2ps %xmm2, %xmm0 +; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503600e+15,4.503600e+15] +; X64-NEXT: por %xmm2, %xmm0 +; X64-NEXT: subpd %xmm2, %xmm0 +; X64-NEXT: cvtpd2ps %xmm0, %xmm0 ; X64-NEXT: mulps %xmm1, %xmm0 ; X64-NEXT: retq %t1 = uitofp <2 x i32> %in to <2 x float> Index: test/CodeGen/X86/i64-to-float.ll =================================================================== --- test/CodeGen/X86/i64-to-float.ll +++ test/CodeGen/X86/i64-to-float.ll @@ -16,7 +16,7 @@ ; ; X32-AVX-LABEL: mask_sitofp_2i64_2f64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X32-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X32-AVX-NEXT: retl ; @@ -29,7 +29,7 @@ ; ; X64-AVX-LABEL: mask_sitofp_2i64_2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-AVX-NEXT: retq %and = and <2 x i64> %a, @@ -47,7 +47,7 @@ ; ; X32-AVX-LABEL: mask_uitofp_2i64_2f64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X32-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X32-AVX-NEXT: retl ; @@ -60,7 +60,7 @@ ; ; X64-AVX-LABEL: mask_uitofp_2i64_2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero ; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-AVX-NEXT: retq %and = and <2 x i64> %a, Index: test/CodeGen/X86/insertelement-shuffle.ll =================================================================== --- test/CodeGen/X86/insertelement-shuffle.ll +++ test/CodeGen/X86/insertelement-shuffle.ll @@ -46,18 +46,10 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind { ; X32_AVX256-LABEL: insert_subvector_512: ; X32_AVX256: # %bb.0: -; X32_AVX256-NEXT: pushl %ebp -; X32_AVX256-NEXT: movl %esp, %ebp -; X32_AVX256-NEXT: andl $-8, %esp -; X32_AVX256-NEXT: subl $8, %esp -; X32_AVX256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32_AVX256-NEXT: vmovlps %xmm2, (%esp) ; X32_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2 -; X32_AVX256-NEXT: vpinsrd $0, (%esp), %xmm2, %xmm2 +; X32_AVX256-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; X32_AVX256-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; X32_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; X32_AVX256-NEXT: movl %ebp, %esp -; X32_AVX256-NEXT: popl %ebp ; X32_AVX256-NEXT: retl ; ; X64_AVX256-LABEL: insert_subvector_512: Index: test/CodeGen/X86/known-signbits-vector.ll =================================================================== --- test/CodeGen/X86/known-signbits-vector.ll +++ test/CodeGen/X86/known-signbits-vector.ll @@ -10,8 +10,12 @@ ; ; X64-LABEL: signbits_sext_v2i64_sitofp_v2f64: ; X64: # %bb.0: -; X64-NEXT: vmovd %edi, %xmm0 -; X64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: movslq %esi, %rcx +; X64-NEXT: vmovq %rcx, %xmm0 +; X64-NEXT: vmovq %rax, %xmm1 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-NEXT: retq %1 = sext i32 %a0 to i64 @@ -253,7 +257,9 @@ ; X32-NEXT: vpsrad $16, %xmm0, %xmm1 ; X32-NEXT: vpsrlq $16, %xmm0, %xmm0 ; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X32-NEXT: vpsrad $16, %xmm0, %xmm1 ; X32-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X32-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X32-NEXT: retl @@ -263,7 +269,9 @@ ; X64-NEXT: vpsrad $16, %xmm0, %xmm1 ; X64-NEXT: vpsrlq $16, %xmm0, %xmm0 ; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X64-NEXT: vpsrad $16, %xmm0, %xmm1 ; X64-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-NEXT: retq Index: test/CodeGen/X86/lower-bitcast.ll =================================================================== --- test/CodeGen/X86/lower-bitcast.ll +++ test/CodeGen/X86/lower-bitcast.ll @@ -9,9 +9,7 @@ define double @test1(double %A) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq ; ; CHECK-WIDE-LABEL: 
test1: @@ -68,9 +66,7 @@ ; CHECK-LABEL: test4: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq ; Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -919,13 +919,12 @@ ; KNL_64-LABEL: test17: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovapd %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -933,37 +932,36 @@ ; KNL_32-LABEL: test17: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovapd %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test17: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 +; SKX-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 -; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1} ; SKX-NEXT: vmovapd %xmm2, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test17: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 +; SKX_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1} ; SKX_32-NEXT: vmovapd %xmm2, %xmm0 +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -1084,8 +1082,8 @@ ; ; KNL_32-LABEL: test20: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 @@ -1103,7 +1101,6 @@ ; ; SKX_32-LABEL: test20: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1} @@ -1117,9 +1114,9 @@ ; KNL_64-LABEL: test21: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, 
%xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -1128,10 +1125,10 @@ ; ; KNL_32-LABEL: test21: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} @@ -1142,7 +1139,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} ; SKX-NEXT: retq ; @@ -1150,8 +1146,6 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) @@ -1165,7 +1159,7 @@ ; KNL_64-LABEL: test22: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 @@ -1178,7 +1172,7 @@ ; KNL_32-LABEL: test22: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 @@ -1191,7 +1185,6 @@ ; ; SKX-LABEL: test22: ; SKX: # %bb.0: -; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} @@ -1200,7 +1193,6 @@ ; ; SKX_32-LABEL: test22: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1268,28 +1260,28 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) { ; KNL_64-LABEL: test23: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1297,10 +1289,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test23: @@ -1308,10 +1298,8 @@ ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind @@ -1322,28 +1310,28 @@ define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) { ; KNL_64-LABEL: test23b: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1} +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23b: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1} +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1351,9 +1339,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test23b: @@ -1361,9 +1348,8 @@ ; SKX_32-NEXT: 
vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) @@ -1373,22 +1359,22 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test24: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: movw $3, %ax ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test24: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: movw $3, %cx ; KNL_32-NEXT: kmovw %ecx, %k1 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1396,9 +1382,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovw %eax, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vmovdqa %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test24: @@ -1406,9 +1391,8 @@ ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: movb $3, %cl ; SKX_32-NEXT: kmovw %ecx, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vmovdqa %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind @@ -1420,13 +1404,12 @@ ; KNL_64-LABEL: test25: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -1434,37 +1417,36 @@ ; KNL_32-LABEL: test25: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovdqa 
%xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test25: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 +; SKX-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 -; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %ymm2 {%k1} ; SKX-NEXT: vmovdqa %xmm2, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test25: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 +; SKX_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %ymm2 {%k1} ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind @@ -1476,11 +1458,10 @@ ; KNL_64-LABEL: test26: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} +; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm1 {%k1} ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -1488,33 +1469,34 @@ ; KNL_32-LABEL: test26: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: movb $3, %cl ; KNL_32-NEXT: kmovw %ecx, %k1 -; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} +; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test26: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} +; SKX-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; SKX-NEXT: movb $3, %al +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %ymm1 {%k1} ; SKX-NEXT: vmovdqa %xmm1, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test26: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 +; SKX_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: kxnorw %k0, %k0, %k1 -; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1} +; SKX_32-NEXT: movb $3, %cl +; SKX_32-NEXT: kmovw %ecx, %k1 +; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %ymm1 {%k1} ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0 +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind @@ -1526,40 +1508,40 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test27: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: movw $3, %ax ; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vgatherdps 
(%rdi,%zmm1,4), %zmm0 {%k1} -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test27: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: movw $3, %cx ; KNL_32-NEXT: kmovw %ecx, %k1 -; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test27: ; SKX: # %bb.0: -; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovw %eax, %k1 -; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1} +; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test27: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: movb $3, %cl ; SKX_32-NEXT: kmovw %ecx, %k1 -; SKX_32-NEXT: vgatherdps (%eax,%xmm1,4), %xmm0 {%k1} +; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1} +; SKX_32-NEXT: vmovaps %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind @@ -1572,7 +1554,7 @@ ; KNL_64-LABEL: test28: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -1581,8 +1563,8 @@ ; ; KNL_32-LABEL: test28: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movw $3, %ax ; KNL_32-NEXT: kmovw %eax, %k1 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} @@ -1591,7 +1573,6 @@ ; ; SKX-LABEL: test28: ; SKX: # %bb.0: -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} ; SKX-NEXT: retq @@ -1600,8 +1581,6 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: movb $3, %al ; SKX_32-NEXT: kmovw %eax, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> ) @@ -2670,48 +2649,46 @@ define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) { ; KNL_64-LABEL: test_scatter_2i32_index: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_64-NEXT: vpsllq $32, %xmm1, %xmm1 -; KNL_64-NEXT: vpsraq $32, %zmm1, %zmm1 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1} +; KNL_64-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1} ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; 
KNL_32-LABEL: test_scatter_2i32_index: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1 -; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm1,8) {%k1} +; KNL_32-NEXT: vscatterdpd %zmm0, (%eax,%ymm1,8) {%k1} ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test_scatter_2i32_index: ; SKX: # %bb.0: +; SKX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1 -; SKX-NEXT: vpsllq $32, %xmm1, %xmm1 -; SKX-NEXT: vpsraq $32, %xmm1, %xmm1 -; SKX-NEXT: vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1} +; SKX-NEXT: vscatterdpd %ymm0, (%rdi,%xmm1,8) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_scatter_2i32_index: ; SKX_32: # %bb.0: +; SKX_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1 -; SKX_32-NEXT: vpsllq $32, %xmm1, %xmm1 -; SKX_32-NEXT: vpsraq $32, %xmm1, %xmm1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vscatterqpd %xmm0, (%eax,%xmm1,8) {%k1} +; SKX_32-NEXT: vscatterdpd %ymm0, (%eax,%xmm1,8) {%k1} +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl %gep = getelementptr double, double *%base, <2 x i32> %ind call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask) Index: test/CodeGen/X86/masked_memop.ll =================================================================== --- test/CodeGen/X86/masked_memop.ll +++ test/CodeGen/X86/masked_memop.ll @@ -515,30 +515,20 @@ } define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { -; AVX1-LABEL: test14: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: test14: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: test14: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: test14: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1} @@ -547,9 +537,9 @@ ; ; SKX-LABEL: test14: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; 
SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 ; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -561,41 +551,38 @@ ; AVX1-LABEL: test15: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test15: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test15: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test15: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 +; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) @@ -603,32 +590,21 @@ } define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { -; AVX1-LABEL: test16: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test16: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: test16: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm0, %xmm2, 
%xmm1, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: test16: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} @@ -638,9 +614,9 @@ ; ; SKX-LABEL: test16: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 ; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -652,48 +628,41 @@ ; AVX1-LABEL: test17: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test17: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test17: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} -; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test17: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} -; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 +; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) @@ -701,29 +670,19 @@ } define <2 x float> @test18(<2 x 
i32> %trigger, <2 x float>* %addr) { -; AVX1-LABEL: test18: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test18: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: test18: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: test18: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} @@ -733,9 +692,9 @@ ; ; SKX-LABEL: test18: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; SKX-NEXT: kshiftlw $14, %k0, %k0 +; SKX-NEXT: kshiftrw $14, %k0, %k1 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer Index: test/CodeGen/X86/mmx-cvt.ll =================================================================== --- test/CodeGen/X86/mmx-cvt.ll +++ test/CodeGen/X86/mmx-cvt.ll @@ -296,8 +296,8 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $32, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movq (%eax), %mm0 ; X86-NEXT: paddd %mm0, %mm0 Index: test/CodeGen/X86/mulvi32.ll =================================================================== --- test/CodeGen/X86/mulvi32.ll +++ test/CodeGen/X86/mulvi32.ll @@ -7,58 +7,39 @@ ; PR6399 define <2 x i32> @_mul2xi32a(<2 x i32>, <2 x i32>) { -; SSE-LABEL: _mul2xi32a: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: _mul2xi32a: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSE42-LABEL: _mul2xi32a: +; SSE42: # %bb.0: +; SSE42-NEXT: pmulld %xmm1, %xmm0 +; SSE42-NEXT: retq ; ; AVX-LABEL: _mul2xi32a: ; AVX: # %bb.0: -; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX-NEXT: vpmuludq 
%xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %r = mul <2 x i32> %0, %1 ret <2 x i32> %r } define <2 x i32> @_mul2xi32b(<2 x i32>, <2 x i32>) { -; SSE2-LABEL: _mul2xi32b: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSE2-NEXT: retq -; -; SSE42-LABEL: _mul2xi32b: -; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE42-NEXT: pmuludq %xmm0, %xmm1 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: retq +; SSE-LABEL: _mul2xi32b: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: _mul2xi32b: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq %factor0 = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> %factor1 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -88,42 +88,30 @@ define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind { ; SSE2-LABEL: v3i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, 8(%rdi) -; SSE2-NEXT: movq %xmm2, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movd %xmm2, 8(%rdi) +; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: v3i32: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi) -; SSE42-NEXT: movq %xmm1, (%rdi) +; SSE42-NEXT: extractps $1, %xmm0, 8(%rdi) +; SSE42-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE42-NEXT: movlps %xmm0, (%rdi) ; SSE42-NEXT: retq ; -; AVX1-LABEL: v3i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-NEXT: vextractps $2, %xmm0, 8(%rdi) -; AVX1-NEXT: vmovlps %xmm1, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v3i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vextractps $2, %xmm0, 8(%rdi) -; AVX2-NEXT: vmovlps %xmm1, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vextractps $1, %xmm0, 8(%rdi) +; AVX-NEXT: vmovlps %xmm1, (%rdi) +; AVX-NEXT: retq ; ; XOP-LABEL: v3i32: ; XOP: # %bb.0: -; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; XOP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; XOP-NEXT: vextractps $2, %xmm0, 8(%rdi) +; XOP-NEXT: vunpcklps {{.*#+}} 
xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; XOP-NEXT: vextractps $1, %xmm0, 8(%rdi) ; XOP-NEXT: vmovlps %xmm1, (%rdi) ; XOP-NEXT: retq %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> Index: test/CodeGen/X86/pointer-vector.ll =================================================================== --- test/CodeGen/X86/pointer-vector.ll +++ test/CodeGen/X86/pointer-vector.ll @@ -117,7 +117,7 @@ ; CHECK-LABEL: BITCAST1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retl entry: %G = load <2 x i8*>, <2 x i8*>* %p Index: test/CodeGen/X86/ret-mmx.ll =================================================================== --- test/CodeGen/X86/ret-mmx.ll +++ test/CodeGen/X86/ret-mmx.ll @@ -33,7 +33,7 @@ ; CHECK-LABEL: t3: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: retq ret <2 x i32> } Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -902,12 +902,13 @@ ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psadbw %xmm3, %xmm2 -; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; @@ -924,13 +925,15 @@ ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] ; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB3_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vphaddd %xmm1, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: sad_2i8: @@ -946,13 +949,14 @@ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB3_1 ; AVX512F-NEXT: # %bb.2: # %middle.block -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vphaddd %xmm1, %xmm1, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: sad_2i8: @@ -968,13 +972,14 @@ ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX512BW-NEXT: addq $4, %rax ; AVX512BW-NEXT: jne .LBB3_1 ; AVX512BW-NEXT: # %bb.2: # %middle.block -; 
AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vphaddd %xmm1, %xmm1, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq entry: br label %vector.body Index: test/CodeGen/X86/scalar_widen_div.ll =================================================================== --- test/CodeGen/X86/scalar_widen_div.ll +++ test/CodeGen/X86/scalar_widen_div.ll @@ -13,20 +13,19 @@ ; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: pmovsxdq (%rdi,%rcx,8), %xmm0 -; CHECK-NEXT: pmovsxdq (%rsi,%rcx,8), %xmm1 -; CHECK-NEXT: pextrq $1, %xmm0, %rax -; CHECK-NEXT: pextrq $1, %xmm1, %rsi -; CHECK-NEXT: cqto -; CHECK-NEXT: idivq %rsi -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: movq %xmm1, %rsi -; CHECK-NEXT: cqto -; CHECK-NEXT: idivq %rsi -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: pextrd $1, %xmm0, %eax +; CHECK-NEXT: pextrd $1, %xmm1, %esi +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %esi +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: movd %xmm1, %edi +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %edi +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: pinsrd $1, %esi, %xmm0 ; CHECK-NEXT: movq %xmm0, (%r8,%rcx,8) ; CHECK-NEXT: retq entry: Index: test/CodeGen/X86/shrink_vmul.ll =================================================================== --- test/CodeGen/X86/shrink_vmul.ll +++ test/CodeGen/X86/shrink_vmul.ll @@ -33,25 +33,37 @@ ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: movzbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movzbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movzbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movzbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, 
%xmm1, %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8: @@ -72,10 +84,15 @@ ; X64-AVX-LABEL: mul_2xi8: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movzbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -492,7 +509,9 @@ ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -510,7 +529,8 @@ ; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; @@ -947,25 +967,37 @@ ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: movd %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_sext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: movsbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movsbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movsbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movsbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 
4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_sext: @@ -988,10 +1020,15 @@ ; X64-AVX-LABEL: mul_2xi8_sext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: movsbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movsbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1039,25 +1076,37 @@ ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: movd %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_sext_zext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movsbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movsbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movzbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movzbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_sext_zext: @@ -1081,10 +1130,15 @@ ; X64-AVX-LABEL: mul_2xi8_sext_zext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movsbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movzbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1126,25 +1180,37 @@ ; X86-SSE-NEXT: pmulhw %xmm0, 
%xmm2 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_sext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: movswl 2(%edx,%ecx), %edi +; X86-AVX-NEXT: movswl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movswl 2(%eax,%ecx), %edx +; X86-AVX-NEXT: movswl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_sext: @@ -1162,10 +1228,15 @@ ; X64-AVX-LABEL: mul_2xi16_sext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: movswl 2(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movswl 2(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1204,43 +1275,41 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: psrlq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE-NEXT: paddq %xmm2, %xmm3 -; X86-SSE-NEXT: psllq $32, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: paddq %xmm3, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: movd %xmm3, 
4(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_sext_zext: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-NEXT: .cfi_offset %esi, -12 +; X86-AVX-NEXT: .cfi_offset %edi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: movswl 2(%edx,%ecx), %edi +; X86-AVX-NEXT: movswl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_sext_zext: @@ -1249,34 +1318,29 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pxor %xmm2, %xmm2 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-SSE-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE-NEXT: psrlq $32, %xmm3 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE-NEXT: paddq %xmm2, %xmm3 -; X64-SSE-NEXT: psllq $32, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: paddq %xmm3, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_sext_zext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: movswl 2(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1479,18 +1543,27 @@ ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst1: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst1: @@ -1508,12 +1581,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst1: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: movl $255, %ecx -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1547,18 +1619,27 @@ ; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst2: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst2: @@ -1577,9 +1658,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst2: ; X64-AVX: # 
%bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1615,18 +1698,27 @@ ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst3: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst3: @@ -1647,12 +1739,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst3: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100 -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1688,18 +1779,27 @@ ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst4: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl 
(%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst4: @@ -1720,9 +1820,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst4: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1758,18 +1860,27 @@ ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst5: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst5: @@ -1790,9 +1901,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst5: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1828,18 +1941,27 @@ ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst6: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; 
X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi8_varconst6: @@ -1860,9 +1982,11 @@ ; X64-AVX-LABEL: mul_2xi8_varconst6: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1895,7 +2019,9 @@ ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst1: @@ -1905,10 +2031,9 @@ ; X86-AVX-NEXT: movl c, %edx ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_varconst1: @@ -1928,12 +2053,7 @@ ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; X64-AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1966,18 +2086,27 @@ ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst2: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movswl 2(%ecx,%eax), %esi +; X86-AVX-NEXT: movswl 
(%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_varconst2: @@ -1995,9 +2124,11 @@ ; X64-AVX-LABEL: mul_2xi16_varconst2: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movswl 2(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -2027,16 +2158,11 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm0 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 +; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm1, 4(%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst3: @@ -2046,10 +2172,9 @@ ; X86-AVX-NEXT: movl c, %edx ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_varconst3: @@ -2058,17 +2183,12 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pxor %xmm1, %xmm1 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-SSE-NEXT: movq %rcx, %xmm1 -; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm0 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; @@ 
-2077,12 +2197,7 @@ ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; X64-AVX-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -2112,27 +2227,29 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm0 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 +; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm1, 4(%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst4: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: movswl 2(%ecx,%eax), %esi +; X86-AVX-NEXT: movswl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_varconst4: @@ -2141,29 +2258,23 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 -; X64-SSE-NEXT: movq %rcx, %xmm1 -; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm0 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_varconst4: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq 
(%rdi,%rsi), %xmm0 -; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000 -; X64-AVX-NEXT: vmovq %rcx, %xmm1 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movswl 2(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: Index: test/CodeGen/X86/shuffle-strided-with-offset-128.ll =================================================================== --- test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -144,29 +144,11 @@ ; AVX-NEXT: vmovlps %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v4i32_to_v2i32_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %L %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> store <2 x i32> %strided.vec, <2 x i32>* %S Index: test/CodeGen/X86/shuffle-vs-trunc-128.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -247,29 +247,11 @@ ; AVX-NEXT: vmovlps %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v4i32_to_v2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_to_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v4i32_to_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v4i32_to_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %L %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> store <2 x i32> %strided.vec, <2 x i32>* %S @@ -283,16 +265,36 @@ ; SSE-NEXT: movq %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: trunc_v2i64_to_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT: vmovlps %xmm0, (%rsi) -; AVX-NEXT: 
retq +; AVX1-LABEL: trunc_v2i64_to_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vmovlps %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovlps %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v2i64_to_v2i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovlps %xmm0, (%rsi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_v2i64_to_v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v2i64_to_v2i32: @@ -303,8 +305,10 @@ ; ; AVX512BW-LABEL: trunc_v2i64_to_v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32: Index: test/CodeGen/X86/sse-fsignum.ll =================================================================== --- test/CodeGen/X86/sse-fsignum.ll +++ test/CodeGen/X86/sse-fsignum.ll @@ -33,19 +33,51 @@ } define void @signum64a(<2 x double>*) { -; AVX-LABEL: signum64a: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovapd (%rdi), %xmm0 -; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX-NEXT: vcvtdq2pd %xmm2, %xmm2 -; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: vsubpd %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmovapd %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: signum64a: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd (%rdi), %xmm0 +; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vcvtdq2pd %xmm2, %xmm2 +; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovapd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: signum64a: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovapd (%rdi), %xmm0 +; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vcvtdq2pd %xmm2, %xmm2 +; AVX2-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovapd %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: signum64a: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vmovapd (%rdi), %xmm0 +; AVX512F-NEXT: vxorpd %xmm1, 
%xmm1, %xmm1 +; AVX512F-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vcvtdq2pd %xmm2, %xmm2 +; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX512F-NEXT: vsubpd %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vmovapd %xmm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq entry: %1 = load <2 x double>, <2 x double>* %0 %2 = fcmp olt <2 x double> %1, zeroinitializer Index: test/CodeGen/X86/trunc-ext-ld-st.ll =================================================================== --- test/CodeGen/X86/trunc-ext-ld-st.ll +++ test/CodeGen/X86/trunc-ext-ld-st.ll @@ -61,22 +61,12 @@ } define void @load_2_i32(<2 x i32>* %A) { -; SSE2-LABEL: load_2_i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSE41-LABEL: load_2_i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: retq +; CHECK-LABEL: load_2_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movq %xmm0, (%rdi) +; CHECK-NEXT: retq %T = load <2 x i32>, <2 x i32>* %A %G = add <2 x i32> %T, store <2 x i32> %G, <2 x i32>* %A Index: test/CodeGen/X86/trunc-subvector.ll =================================================================== --- test/CodeGen/X86/trunc-subvector.ll +++ test/CodeGen/X86/trunc-subvector.ll @@ -40,26 +40,14 @@ define <2 x i32> @test3(<8 x i32> %v) { ; SSE2-LABEL: test3: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; AVX2-LABEL: test3: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test3: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test3: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -69,24 +57,13 @@ define <2 x i32> @test4(<8 x i32> %v) { ; SSE2-LABEL: test4: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; -; AVX2-LABEL: test4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test4: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> %s = 
shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -99,11 +76,8 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; AVX2-LABEL: test5: @@ -112,7 +86,8 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,2,4,6,4,6,6,7] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -120,9 +95,11 @@ ; AVX512-LABEL: test5: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> @@ -167,25 +144,14 @@ define <2 x i32> @test8(<8 x i32> %v) { ; SSE2-LABEL: test8: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; AVX2-LABEL: test8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test8: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -195,23 +161,13 @@ define <2 x i32> @test9(<8 x i32> %v) { ; SSE2-LABEL: test9: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; -; AVX2-LABEL: test9: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test9: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test9: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -221,19 +177,16 @@ define <2 x i32> @test10(<8 x i32> %v) { ; SSE2-LABEL: test10: ; SSE2: # %bb.0: -; SSE2-NEXT: xorpd %xmm2, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; AVX2-LABEL: test10: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <3,4,u,u,u,u,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -241,9 +194,11 @@ ; AVX512-LABEL: test10: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> Index: test/CodeGen/X86/vec_ctbits.ll =================================================================== --- test/CodeGen/X86/vec_ctbits.ll +++ test/CodeGen/X86/vec_ctbits.ll @@ -111,28 +111,32 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind { ; CHECK-LABEL: promtz: ; CHECK: # %bb.0: -; CHECK-NEXT: por {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pxor %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: psubq %xmm0, %xmm2 +; CHECK-NEXT: psubd %xmm0, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm2 ; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 -; CHECK-NEXT: paddq %xmm2, %xmm3 +; CHECK-NEXT: paddd %xmm2, %xmm3 ; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: psrlq $1, %xmm0 +; CHECK-NEXT: psrld $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psubq %xmm0, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; CHECK-NEXT: psubd %xmm0, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] ; CHECK-NEXT: movdqa %xmm3, %xmm2 ; CHECK-NEXT: 
pand %xmm0, %xmm2 -; CHECK-NEXT: psrlq $2, %xmm3 +; CHECK-NEXT: psrld $2, %xmm3 ; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: paddq %xmm2, %xmm3 +; CHECK-NEXT: paddd %xmm2, %xmm3 ; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: psrlq $4, %xmm0 -; CHECK-NEXT: paddq %xmm3, %xmm0 +; CHECK-NEXT: psrld $4, %xmm0 +; CHECK-NEXT: paddd %xmm3, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-NEXT: psadbw %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: psadbw %xmm1, %xmm0 +; CHECK-NEXT: packuswb %xmm2, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) ret <2 x i32> %c @@ -141,44 +145,44 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind { ; CHECK-LABEL: promlz: ; CHECK: # %bb.0: -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrlq $1, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlq $2, %xmm0 -; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrlq $4, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlq $8, %xmm0 -; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrlq $16, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlq $32, %xmm0 -; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $1, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrld $2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $4, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrld $8, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 ; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlq $1, %xmm0 +; CHECK-NEXT: psrld $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psubq %xmm0, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] -; CHECK-NEXT: movdqa %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: psrlq $2, %xmm2 +; CHECK-NEXT: psubd %xmm0, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; CHECK-NEXT: movdqa %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm0, %xmm1 +; CHECK-NEXT: psrld $2, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: paddq %xmm3, %xmm2 +; CHECK-NEXT: paddd %xmm1, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlq $4, %xmm0 -; CHECK-NEXT: paddq %xmm2, %xmm0 +; CHECK-NEXT: psrld $4, %xmm0 +; CHECK-NEXT: paddd %xmm2, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-NEXT: psadbw %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: psadbw %xmm1, %xmm0 -; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0 +; CHECK-NEXT: packuswb %xmm2, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) ret <2 x i32> %c @@ -188,23 +192,27 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind { ; CHECK-LABEL: prompop: 
; CHECK: # %bb.0: -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $1, %xmm1 +; CHECK-NEXT: psrld $1, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psubq %xmm1, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] -; CHECK-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NEXT: pand %xmm1, %xmm3 -; CHECK-NEXT: psrlq $2, %xmm0 +; CHECK-NEXT: psubd %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: psrld $2, %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 -; CHECK-NEXT: paddq %xmm3, %xmm0 +; CHECK-NEXT: paddd %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $4, %xmm1 -; CHECK-NEXT: paddq %xmm0, %xmm1 +; CHECK-NEXT: psrld $4, %xmm1 +; CHECK-NEXT: paddd %xmm0, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psadbw %xmm2, %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; CHECK-NEXT: psadbw %xmm0, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: psadbw %xmm0, %xmm1 +; CHECK-NEXT: packuswb %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) Index: test/CodeGen/X86/vec_extract-avx.ll =================================================================== --- test/CodeGen/X86/vec_extract-avx.ll +++ test/CodeGen/X86/vec_extract-avx.ll @@ -117,9 +117,7 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: vmovaps %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl Index: test/CodeGen/X86/vec_extract-mmx.ll =================================================================== --- test/CodeGen/X86/vec_extract-mmx.ll +++ test/CodeGen/X86/vec_extract-mmx.ll @@ -125,12 +125,10 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %ebp ; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $32, %esp ; X32-NEXT: movq %mm0, (%esp) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X32-NEXT: movd %xmm0, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: retl @@ -138,9 +136,7 @@ ; X64-LABEL: test4: ; X64: # %bb.0: ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: retq %tmp0 = bitcast x86_mmx %a to <2 x i32> %tmp1 = extractelement <2 x i32> %tmp0, i32 1 Index: test/CodeGen/X86/vec_fp_to_int.ll =================================================================== --- test/CodeGen/X86/vec_fp_to_int.ll +++ test/CodeGen/X86/vec_fp_to_int.ll @@ -93,13 +93,11 @@ ; SSE-LABEL: fptosi_2f64_to_2i32: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_2i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; 
AVX-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i32> ret <2 x i32> %cvt @@ -338,52 +336,26 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: cvttsd2si %xmm0, %rax ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax ; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f64_to_4i32: ; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttsd2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rdx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttsd2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; VEX-NEXT: vcvttsd2si %xmm1, %rax ; VEX-NEXT: vcvttsd2si %xmm0, %rcx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; VEX-NEXT: vcvttsd2si %xmm0, %rax +; VEX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; VEX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_4i32: @@ -419,50 +391,25 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: subsd %xmm1, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm1, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, 
%xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f64_to_2i32: ; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttsd2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rdx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttsd2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; VEX-NEXT: vcvttsd2si %xmm1, %rax ; VEX-NEXT: vcvttsd2si %xmm0, %rcx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; VEX-NEXT: vcvttsd2si %xmm0, %rax +; VEX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; VEX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_2i32: @@ -498,34 +445,17 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovbq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_4f64_to_2i32: @@ -752,46 +682,20 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm1, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm2 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subsd %xmm2, 
%xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subsd %xmm2, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_4f64_to_4i32: @@ -849,13 +753,11 @@ ; SSE-LABEL: fptosi_2f32_to_2i32: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f32_to_2i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i32> ret <2 x i32> %cvt @@ -1234,77 +1136,64 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: subss %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] +; SSE-NEXT: cvttss2si %xmm2, %rax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subss %xmm2, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rcx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f32_to_2i32: ; VEX: # %bb.0: -; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttss2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # 
imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rdx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttss2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; VEX-NEXT: vcvttss2si %xmm1, %rax ; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: vmovd %ecx, %xmm1 +; VEX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; VEX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; VEX-NEXT: vcvttss2si %xmm2, %rax +; VEX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f32_to_2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f32_to_2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_2f32_to_2i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VLDQ-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i32> ret <2 x i32> %cvt @@ -2242,7 +2131,8 @@ ; SSE-LABEL: fptosi_2f16_to_4i32: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rax -; SSE-NEXT: movss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: callq __gnu_f2h_ieee ; SSE-NEXT: movzwl %ax, %edi ; SSE-NEXT: callq __gnu_h2f_ieee @@ -2252,20 +2142,20 @@ ; SSE-NEXT: callq __gnu_f2h_ieee ; SSE-NEXT: movzwl %ax, %edi ; SSE-NEXT: callq __gnu_h2f_ieee -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: cvttss2si (%rsp), %rax # 4-byte Folded Reload -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f16_to_4i32: ; VEX: # %bb.0: ; VEX-NEXT: pushq %rax -; VEX-NEXT: vmovss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill +; VEX-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill +; VEX-NEXT: vmovaps %xmm1, %xmm0 ; VEX-NEXT: callq __gnu_f2h_ieee ; VEX-NEXT: movzwl %ax, %edi ; VEX-NEXT: callq __gnu_h2f_ieee @@ -2275,27 +2165,27 @@ ; VEX-NEXT: callq 
__gnu_f2h_ieee ; VEX-NEXT: movzwl %ax, %edi ; VEX-NEXT: callq __gnu_h2f_ieee -; VEX-NEXT: vcvttss2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vcvttss2si (%rsp), %rax # 4-byte Folded Reload -; VEX-NEXT: vmovq %rax, %xmm1 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; VEX-NEXT: vcvttss2si %xmm0, %eax +; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vmovd %eax, %xmm1 +; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; VEX-NEXT: popq %rax ; VEX-NEXT: retq ; ; AVX512-LABEL: fptosi_2f16_to_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvttss2si %xmm1, %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vcvttss2si %xmm0, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvttss2si %xmm0, %eax +; AVX512-NEXT: vcvttss2si %xmm1, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512-NEXT: retq %cvt = fptosi <2 x half> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> @@ -2312,32 +2202,31 @@ ; SSE-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f80_to_4i32: ; AVX: # %bb.0: ; AVX-NEXT: fldt {{[0-9]+}}(%rsp) ; AVX-NEXT: fldt {{[0-9]+}}(%rsp) -; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp) -; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: retq %cvt = fptosi <2 x x86_fp80> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> 
zeroinitializer, <4 x i32> @@ -2347,51 +2236,44 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { ; SSE-LABEL: fptosi_2f128_to_4i32: ; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movq %rsi, %r14 -; SSE-NEXT: movq %rdi, %rbx -; SSE-NEXT: movq %rdx, %rdi -; SSE-NEXT: movq %rcx, %rsi -; SSE-NEXT: callq __fixtfdi -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movq %rcx, %r14 +; SSE-NEXT: movq %rdx, %rbx +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movl %eax, %ebp ; SSE-NEXT: movq %rbx, %rdi ; SSE-NEXT: movq %r14, %rsi -; SSE-NEXT: callq __fixtfdi -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] -; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ebp, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f128_to_4i32: ; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp ; AVX-NEXT: pushq %r14 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: movq %rsi, %r14 -; AVX-NEXT: movq %rdi, %rbx -; AVX-NEXT: movq %rdx, %rdi -; AVX-NEXT: movq %rcx, %rsi -; AVX-NEXT: callq __fixtfdi -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: movq %rcx, %r14 +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: movl %eax, %ebp ; AVX-NEXT: movq %rbx, %rdi ; AVX-NEXT: movq %r14, %rsi -; AVX-NEXT: callq __fixtfdi -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vmovd %ebp, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %rbp ; AVX-NEXT: retq %cvt = fptosi <2 x fp128> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> Index: test/CodeGen/X86/vec_insert-5.ll =================================================================== --- test/CodeGen/X86/vec_insert-5.ll +++ test/CodeGen/X86/vec_insert-5.ll @@ -10,18 +10,15 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shll $12, %ecx -; X32-NEXT: movd %ecx, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; X32-NEXT: movq %xmm0, (%eax) +; X32-NEXT: movl %ecx, 4(%eax) +; X32-NEXT: movl $0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: t1: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: shll $12, %edi -; X64-NEXT: movq %rdi, %xmm0 -; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq %tmp12 = shl i32 %a, 12 Index: test/CodeGen/X86/vec_insert-7.ll =================================================================== --- test/CodeGen/X86/vec_insert-7.ll +++ 
test/CodeGen/X86/vec_insert-7.ll @@ -8,27 +8,27 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind { ; X32-LABEL: mmx_movzl: ; X32: ## %bb.0: -; X32-NEXT: subl $20, %esp +; X32-NEXT: subl $44, %esp ; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp) -; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; X32-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: movl $32, %eax ; X32-NEXT: pinsrd $0, %eax, %xmm0 ; X32-NEXT: pxor %xmm1, %xmm1 -; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; X32-NEXT: movq %xmm1, (%esp) +; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; X32-NEXT: movdqa %xmm1, (%esp) ; X32-NEXT: movq (%esp), %mm0 -; X32-NEXT: addl $20, %esp +; X32-NEXT: addl $44, %esp ; X32-NEXT: retl ; ; X64-LABEL: mmx_movzl: ; X64: ## %bb.0: ; X64-NEXT: movdq2q %xmm0, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; X64-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 ; X64-NEXT: movl $32, %eax -; X64-NEXT: pinsrq $0, %rax, %xmm1 +; X64-NEXT: pinsrd $0, %eax, %xmm1 ; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; X64-NEXT: retq %tmp = bitcast x86_mmx %x to <2 x i32> %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 Index: test/CodeGen/X86/vec_insert-mmx.ll =================================================================== --- test/CodeGen/X86/vec_insert-mmx.ll +++ test/CodeGen/X86/vec_insert-mmx.ll @@ -6,20 +6,18 @@ define x86_mmx @t0(i32 %A) nounwind { ; X32-LABEL: t0: ; X32: ## %bb.0: -; X32-NEXT: subl $12, %esp +; X32-NEXT: subl $28, %esp ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; X32-NEXT: movq %xmm0, (%esp) +; X32-NEXT: movdqa %xmm0, (%esp) ; X32-NEXT: movq (%esp), %mm0 -; X32-NEXT: addl $12, %esp +; X32-NEXT: addl $28, %esp ; X32-NEXT: retl ; ; X64-LABEL: t0: ; X64: ## %bb.0: -; X64-NEXT: ## kill: def $edi killed $edi def $rdi -; X64-NEXT: movq %rdi, %xmm0 -; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; X64-NEXT: retq %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1 %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -2653,8 +2653,10 @@ ; ; AVX-LABEL: sitofp_load_2i16_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: movswl 2(%rdi), %eax +; AVX-NEXT: movswl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i16>, <2 x i16> *%a @@ -2675,8 +2677,10 @@ ; ; AVX-LABEL: sitofp_load_2i8_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: movsbl 1(%rdi), %eax +; AVX-NEXT: movsbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i8>, <2 x i8> *%a @@ -3011,8 +3015,10 @@ ; ; AVX-LABEL: uitofp_load_2i8_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: movzbl 1(%rdi), %eax +; AVX-NEXT: movzbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i8>, <2 x i8> *%a Index: test/CodeGen/X86/vec_zero_cse.ll =================================================================== --- test/CodeGen/X86/vec_zero_cse.ll +++ test/CodeGen/X86/vec_zero_cse.ll @@ -15,14 +15,15 @@ ; X32: # %bb.0: ; X32-NEXT: movl $0, M1+4 ; X32-NEXT: movl $0, M1 -; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: movlps %xmm0, M2 +; X32-NEXT: movl $0, M2+4 +; X32-NEXT: movl $0, M2 ; X32-NEXT: retl ; ; X64-LABEL: test1: ; X64: # %bb.0: ; X64-NEXT: movq $0, {{.*}}(%rip) -; X64-NEXT: movq $0, {{.*}}(%rip) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movlps %xmm0, {{.*}}(%rip) ; X64-NEXT: retq store <1 x i64> zeroinitializer, <1 x i64>* @M1 store <2 x i32> zeroinitializer, <2 x i32>* @M2 @@ -34,15 +35,15 @@ ; X32: # %bb.0: ; X32-NEXT: movl $-1, M1+4 ; X32-NEXT: movl $-1, M1 -; X32-NEXT: pcmpeqd %xmm0, %xmm0 -; X32-NEXT: movq %xmm0, M2 +; X32-NEXT: movl $-1, M2+4 +; X32-NEXT: movl $-1, M2 ; X32-NEXT: retl ; ; X64-LABEL: test2: ; X64: # %bb.0: ; X64-NEXT: movq $-1, {{.*}}(%rip) -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movq %rax, {{.*}}(%rip) +; X64-NEXT: pcmpeqd %xmm0, %xmm0 +; X64-NEXT: movq %xmm0, {{.*}}(%rip) ; X64-NEXT: retq store <1 x i64> < i64 -1 >, <1 x i64>* @M1 store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2 Index: test/CodeGen/X86/vector-sext.ll =================================================================== --- test/CodeGen/X86/vector-sext.ll +++ test/CodeGen/X86/vector-sext.ll @@ -5051,8 +5051,7 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_2i8_to_2i32: @@ -5061,27 +5060,35 @@ ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,u,u,u,1,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSSE3-NEXT: paddq %xmm0, %xmm0 +; SSSE3-NEXT: paddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_2i8_to_2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 -; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: movsbl 1(%rdi), %eax +; SSE41-NEXT: movsbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: sext_2i8_to_2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movsbl 1(%rdi), %eax +; AVX-NEXT: movsbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X32-SSE41-LABEL: sext_2i8_to_2i32: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 -; X32-SSE41-NEXT: paddq %xmm0, %xmm0 +; X32-SSE41-NEXT: movsbl 1(%eax), %ecx +; X32-SSE41-NEXT: movsbl (%eax), %eax +; X32-SSE41-NEXT: movd %eax, %xmm0 +; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; X32-SSE41-NEXT: paddd %xmm0, %xmm0 ; X32-SSE41-NEXT: retl %x = load <2 x i8>, <2 x i8>* %addr, align 1 %y = sext <2 x 
i8> %x to <2 x i32> Index: test/CodeGen/X86/vector-shuffle-mmx.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-mmx.ll +++ test/CodeGen/X86/vector-shuffle-mmx.ll @@ -8,9 +8,8 @@ ; X32-LABEL: test0: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-NEXT: movlps %xmm0, (%eax) +; X32-NEXT: movl 4(%eax), %ecx +; X32-NEXT: movl %ecx, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test0: @@ -73,9 +72,8 @@ ; X32-LABEL: test2: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl L_tmp_V2i$non_lazy_ptr, %eax -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X32-NEXT: movlps %xmm0, (%eax) +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: movl %ecx, 4(%eax) ; X32-NEXT: retl ; ; X64-LABEL: test2: Index: test/CodeGen/X86/vector-trunc.ll =================================================================== --- test/CodeGen/X86/vector-trunc.ll +++ test/CodeGen/X86/vector-trunc.ll @@ -1613,15 +1613,73 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: retq ; -; AVX-LABEL: trunc2x2i64_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX-NEXT: retq +; AVX1-LABEL: trunc2x2i64_4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: retq ; -; AVX512-LABEL: trunc2x2i64_4i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512-NEXT: retq +; AVX2-SLOW-LABEL: trunc2x2i64_4i32: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2x2i64_4i32: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc2x2i64_4i32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x2i64_4i32: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x2i64_4i32: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x2i64_4i32: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq entry: %0 = trunc <2 x i64> %a to <2 x i32> %1 = trunc <2 x i64> %b to <2 x i32> @@ -1636,34 +1694,60 @@ ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; -; AVX-LABEL: trunc2i64_i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: trunc2i64_i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc2i64_i64: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovq %xmm0, %rax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2i64_i64: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovq %xmm0, %rax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc2i64_i64: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc2i64_i64: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc2i64_i64: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc2i64_i64: ; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp) -; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq entry: %0 = trunc <2 x i64> %inval to <2 x i32> Index: test/CodeGen/X86/vector-zext.ll =================================================================== --- test/CodeGen/X86/vector-zext.ll +++ test/CodeGen/X86/vector-zext.ll @@ -2271,28 +2271,35 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_2i8_to_2i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movzwl (%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[3],zero,zero,zero -; SSSE3-NEXT: paddq %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: paddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_2i8_to_2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: movzbl 1(%rdi), %eax +; SSE41-NEXT: movzbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: zext_2i8_to_2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movzbl 1(%rdi), %eax +; AVX-NEXT: movzbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %x = load <2 x i8>, <2 x i8>* %addr, align 1 %y = zext <2 x i8> %x to <2 x i32> Index: test/CodeGen/X86/vshift-4.ll =================================================================== --- test/CodeGen/X86/vshift-4.ll +++ test/CodeGen/X86/vshift-4.ll @@ -58,7 +58,7 @@ ; X32-LABEL: shift2a: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -67,7 +67,7 @@ ; ; X64-LABEL: shift2a: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 @@ -84,7 +84,7 @@ ; X32-LABEL: shift2b: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -93,7 +93,7 @@ ; ; X64-LABEL: shift2b: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 @@ -110,7 +110,7 @@ ; X32-LABEL: shift2c: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -119,7 +119,7 @@ ; ; X64-LABEL: shift2c: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 Index: test/CodeGen/X86/widen_arith-3.ll =================================================================== --- test/CodeGen/X86/widen_arith-3.ll +++ test/CodeGen/X86/widen_arith-3.ll @@ -11,8 +11,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: pushl %esi ; CHECK-NEXT: andl $-8, %esp -; CHECK-NEXT: subl $40, %esp +; CHECK-NEXT: subl $32, %esp ; CHECK-NEXT: movl {{\.LCPI.*}}, %eax ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 @@ -26,8 +27,10 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl 12(%ebp), %edx ; CHECK-NEXT: movl 8(%ebp), %ecx -; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; CHECK-NEXT: movl (%edx,%eax,8), %esi +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; CHECK-NEXT: pinsrd $2, 4(%edx,%eax,8), %xmm2 ; CHECK-NEXT: psubd %xmm0, %xmm2 ; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8) @@ -40,7 +43,8 @@ ; CHECK-NEXT: cmpl 16(%ebp), %eax ; CHECK-NEXT: jl .LBB0_2 ; CHECK-NEXT: # %bb.3: # %afterfor -; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: leal -4(%ebp), %esp +; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl entry: Index: test/CodeGen/X86/widen_cast-5.ll =================================================================== --- test/CodeGen/X86/widen_cast-5.ll +++ test/CodeGen/X86/widen_cast-5.ll @@ -8,18 +8,17 @@ ; X86-LABEL: convert: ; X86: ## %bb.0: ## %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: pxor LCPI0_0, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: pextrd $1, %xmm0, 4(%eax) +; X86-NEXT: movd %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movq %rsi, %xmm0 -; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X64-NEXT: pxor {{.*}}(%rip), %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: test/CodeGen/X86/widen_conv-1.ll =================================================================== --- test/CodeGen/X86/widen_conv-1.ll +++ test/CodeGen/X86/widen_conv-1.ll @@ -8,15 +8,18 @@ ; X86-LABEL: convert_v2i64_to_v2i32: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: paddd {{\.LCPI.*}}, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-NEXT: psubd %xmm1, %xmm0 +; X86-NEXT: pextrd $1, %xmm0, 4(%eax) +; X86-NEXT: movd %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert_v2i64_to_v2i32: ; X64: # %bb.0: # %entry -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: psubd %xmm1, %xmm0 ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: @@ -65,20 +68,14 @@ define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) nounwind { ; X86-LABEL: convert_v5i16_to_v5i8: ; X86: # 
%bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movdqa (%ecx), %xmm0 ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: psubw %xmm1, %xmm0 ; X86-NEXT: pextrb $8, %xmm0, 4(%eax) ; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; X86-NEXT: movd %xmm0, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: convert_v5i16_to_v5i8: Index: test/CodeGen/X86/widen_conv-2.ll =================================================================== --- test/CodeGen/X86/widen_conv-2.ll +++ test/CodeGen/X86/widen_conv-2.ll @@ -7,18 +7,19 @@ define void @convert_v2i16_v2i32(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind { ; X86-LABEL: convert_v2i16_v2i32: ; X86: # %bb.0: # %entry +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: psllq $48, %xmm0 +; X86-NEXT: pslld $16, %xmm0 ; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: pextrd $1, %xmm0, 4(%eax) +; X86-NEXT: movd %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert_v2i16_v2i32: ; X64: # %bb.0: # %entry -; X64-NEXT: psllq $48, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pslld $16, %xmm0 ; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: test/CodeGen/X86/widen_conv-3.ll =================================================================== --- test/CodeGen/X86/widen_conv-3.ll +++ test/CodeGen/X86/widen_conv-3.ll @@ -9,10 +9,10 @@ define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind { ; X86-SSE2-LABEL: convert_v2i16_to_v2f32: ; X86-SSE2: # %bb.0: # %entry +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: psllq $48, %xmm0 +; X86-SSE2-NEXT: pslld $16, %xmm0 ; X86-SSE2-NEXT: psrad $16, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE2-NEXT: movss %xmm0, (%eax) ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] @@ -21,10 +21,10 @@ ; ; X86-SSE42-LABEL: convert_v2i16_to_v2f32: ; X86-SSE42: # %bb.0: # %entry +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE42-NEXT: psllq $48, %xmm0 +; X86-SSE42-NEXT: pslld $16, %xmm0 ; X86-SSE42-NEXT: psrad $16, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax) ; X86-SSE42-NEXT: movss %xmm0, (%eax) @@ -32,9 +32,9 @@ ; ; X64-LABEL: convert_v2i16_to_v2f32: ; X64: # %bb.0: # %entry -; X64-NEXT: psllq $48, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pslld $16, %xmm0 ; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-NEXT: movlps %xmm0, (%rdi) ; X64-NEXT: retq Index: test/CodeGen/X86/widen_load-2.ll =================================================================== --- test/CodeGen/X86/widen_load-2.ll +++ test/CodeGen/X86/widen_load-2.ll @@ -150,22 +150,28 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %esi ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $24, %esp ; 
X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl 16(%ebp), %ecx ; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-NEXT: pinsrd $2, 4(%edx), %xmm0 -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, (%esp) +; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-NEXT: pinsrd $2, 4(%ecx), %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 ; X86-NEXT: pextrw $4, %xmm1, 4(%eax) ; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; X86-NEXT: movd %xmm1, (%eax) -; X86-NEXT: movl %ebp, %esp +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi ; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; Index: test/CodeGen/X86/widened-broadcast.ll =================================================================== --- test/CodeGen/X86/widened-broadcast.ll +++ test/CodeGen/X86/widened-broadcast.ll @@ -597,22 +597,10 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_8i32_2i32_0101: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_8i32_2i32_0101: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_8i32_2i32_0101: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_8i32_2i32_0101: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -630,9 +618,7 @@ ; ; AVX1-LABEL: load_splat_16i32_2i32_0101: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 ; AVX1-NEXT: vmovaps %ymm0, %ymm1 ; AVX1-NEXT: retq ; @@ -644,9 +630,7 @@ ; ; AVX512-LABEL: load_splat_16i32_2i32_0101: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vbroadcastsd (%rdi), %zmm0 ; AVX512-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> Index: test/CodeGen/X86/x86-shifts.ll =================================================================== --- test/CodeGen/X86/x86-shifts.ll +++ test/CodeGen/X86/x86-shifts.ll @@ -254,16 +254,16 @@ ; X32-LABEL: shl2_other: ; X32: # %bb.0: # %entry ; X32-NEXT: movdqa %xmm0, %xmm1 -; X32-NEXT: psllq $2, %xmm1 -; X32-NEXT: psllq $9, %xmm0 +; X32-NEXT: pslld $2, %xmm1 +; X32-NEXT: pslld $9, %xmm0 ; X32-NEXT: pxor %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shl2_other: ; X64: # %bb.0: # %entry ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psllq $2, %xmm1 -; X64-NEXT: psllq $9, %xmm0 +; X64-NEXT: pslld $2, %xmm1 +; X64-NEXT: pslld $9, 
%xmm0 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: retq entry: @@ -276,19 +276,17 @@ define <2 x i32> @shr2_other(<2 x i32> %A) nounwind { ; X32-LABEL: shr2_other: ; X32: # %bb.0: # %entry -; X32-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-NEXT: movdqa %xmm0, %xmm1 -; X32-NEXT: psrlq $8, %xmm1 -; X32-NEXT: psrlq $1, %xmm0 +; X32-NEXT: psrld $8, %xmm1 +; X32-NEXT: psrld $1, %xmm0 ; X32-NEXT: pxor %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shr2_other: ; X64: # %bb.0: # %entry -; X64-NEXT: pand {{.*}}(%rip), %xmm0 ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psrlq $8, %xmm1 -; X64-NEXT: psrlq $1, %xmm0 +; X64-NEXT: psrld $8, %xmm1 +; X64-NEXT: psrld $1, %xmm0 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: retq entry: Index: test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -342,18 +342,30 @@ ; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 ; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 ; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 -; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0 -; ZEROTHRESH-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0 -; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] -; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] -; ZEROTHRESH-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]] -; ZEROTHRESH-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]] -; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0 -; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 -; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[S2]], i32 2 -; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 +; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 +; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer +; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer +; ZEROTHRESH-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 +; ZEROTHRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 +; ZEROTHRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] +; ZEROTHRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> 
[[TMP15]] +; ZEROTHRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 +; ZEROTHRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 +; ZEROTHRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 +; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 +; ZEROTHRESH-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 +; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 ; ZEROTHRESH-NEXT: ret <4 x float> [[RD]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -430,18 +442,12 @@ ; CHECK-NEXT: ret <2 x float> [[RB]] ; ; ZEROTHRESH-LABEL: @simple_select_v2( -; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> [[C]], i32 1 -; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> [[A]], i32 1 -; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> [[B:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> [[B]], i32 1 -; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 -; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] -; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] -; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[S0]], i32 0 -; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[S1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer +; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]] +; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0 +; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1 ; ZEROTHRESH-NEXT: ret <2 x float> [[RB]] ; %c0 = extractelement <2 x i32> %c, i32 0