Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -66,7 +66,7 @@
 STATISTIC(NumTailCalls, "Number of tail calls");
 
 static cl::opt<bool> ExperimentalVectorWideningLegalization(
-    "x86-experimental-vector-widening-legalization", cl::init(false),
+    "x86-experimental-vector-widening-legalization", cl::init(true),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);
Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -887,7 +887,7 @@
 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
-  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 
   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
@@ -2425,14 +2425,6 @@
 
 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
                                            bool IsPairwise) {
-
-  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
-
-  MVT MTy = LT.second;
-
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  assert(ISD && "Invalid opcode");
-
   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
   // and make it as the cost.
 
@@ -2440,7 +2432,10 @@
     { ISD::FADD,  MVT::v2f64,   2 },
     { ISD::FADD,  MVT::v4f32,   4 },
     { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
+    { ISD::ADD,   MVT::v2i32,   2 },      // FIXME: chosen to be less than v4i32.
     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
+    { ISD::ADD,   MVT::v2i16,   3 },      // FIXME: chosen to be less than v4i16
+    { ISD::ADD,   MVT::v4i16,   4 },      // FIXME: chosen to be less than v8i16
     { ISD::ADD,   MVT::v8i16,   5 },
   };
 
@@ -2449,8 +2444,11 @@
     { ISD::FADD,  MVT::v4f64,   5 },
     { ISD::FADD,  MVT::v8f32,   7 },
     { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
+    { ISD::ADD,   MVT::v2i32,   2 },      // FIXME: chosen to be less than v4i32
     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
     { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
+    { ISD::ADD,   MVT::v2i16,   3 },      // FIXME: chosen to be less than v4i16
+    { ISD::ADD,   MVT::v4i16,   4 },      // FIXME: chosen to be less than v8i16
     { ISD::ADD,   MVT::v8i16,   5 },
     { ISD::ADD,   MVT::v8i32,   5 },
   };
 
@@ -2459,7 +2457,10 @@
     { ISD::FADD,  MVT::v2f64,   2 },
     { ISD::FADD,  MVT::v4f32,   4 },
     { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
+    { ISD::ADD,   MVT::v2i32,   2 },      // FIXME: chosen to be less than v4i32
     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
+    { ISD::ADD,   MVT::v2i16,   2 },      // The data reported by the IACA tool is "4.3".
+    { ISD::ADD,   MVT::v4i16,   3 },      // The data reported by the IACA tool is "4.3".
     { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
   };
 
@@ -2468,12 +2469,47 @@
     { ISD::FADD,  MVT::v4f64,   3 },
     { ISD::FADD,  MVT::v8f32,   4 },
     { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
+    { ISD::ADD,   MVT::v2i32,   2 },      // FIXME: chosen to be less than v4i32
     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
     { ISD::ADD,   MVT::v4i64,   3 },
+    { ISD::ADD,   MVT::v2i16,   2 },      // The data reported by the IACA tool is "4.3".
+    { ISD::ADD,   MVT::v4i16,   3 },      // The data reported by the IACA tool is "4.3".
     { ISD::ADD,   MVT::v8i16,   4 },
     { ISD::ADD,   MVT::v8i32,   5 },
   };
 
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // Before legalizing the type, give a chance to look up illegal narrow types
+  // in the table.
+  // FIXME: Is there a better way to do this?
+  EVT VT = TLI->getValueType(DL, ValTy);
+  if (VT.isSimple()) {
+    MVT MTy = VT.getSimpleVT();
+    if (IsPairwise) {
+      if (ST->hasAVX())
+        if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+          return Entry->Cost;
+
+      if (ST->hasSSE42())
+        if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+          return Entry->Cost;
+    } else {
+      if (ST->hasAVX())
+        if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+          return Entry->Cost;
+
+      if (ST->hasSSE42())
+        if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+          return Entry->Cost;
+    }
+  }
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+  MVT MTy = LT.second;
+
   if (IsPairwise) {
     if (ST->hasAVX())
       if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
Index: llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
+++ llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
@@ -18,9 +18,21 @@
 ; 64-bit packed float vectors (v2f32) are widened to type v4f32.
 
 define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK-LABEL: 'test_v2i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+; SSE2-LABEL: 'test_v2i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+; SSSE3-LABEL: 'test_v2i32'
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+; SSE42-LABEL: 'test_v2i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+; AVX-LABEL: 'test_v2i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
 ;
 ; BTVER2-LABEL: 'test_v2i32'
 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
@@ -56,9 +68,21 @@
 }
 
 define <2 x i32> @test_v2i32_2(<2 x i32> %a, <2 x i32> %b) {
-; CHECK-LABEL: 'test_v2i32_2'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for
instruction: ret <2 x i32> %1 +; SSE2-LABEL: 'test_v2i32_2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; +; SSSE3-LABEL: 'test_v2i32_2' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; +; SSE42-LABEL: 'test_v2i32_2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 +; +; AVX-LABEL: 'test_v2i32_2' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1 ; ; BTVER2-LABEL: 'test_v2i32_2' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> Index: llvm/test/Analysis/CostModel/X86/arith.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/arith.ll +++ llvm/test/Analysis/CostModel/X86/arith.ll @@ -1342,36 +1342,32 @@ ; A <2 x i64> vector multiply is implemented using ; 3 PMULUDQ and 2 PADDS and 4 shifts. define void @mul_2i32() { -; SSE-LABEL: 'mul_2i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSSE3-LABEL: 'mul_2i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A0 = mul <2 x i32> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'mul_2i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'mul_2i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; -; AVX512F-LABEL: 'mul_2i32' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; AVX512BW-LABEL: 'mul_2i32' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; AVX512DQ-LABEL: 'mul_2i32' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A0 = mul <2 x i32> undef, undef -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-LABEL: 'mul_2i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A0 = mul <2 x i32> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'mul_2i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %A0 = mul <2 x i32> undef, undef +; SLM-NEXT: Cost Model: 
Found an estimated cost of 11 for instruction: %A0 = mul <2 x i32> undef, undef ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; GLM-LABEL: 'mul_2i32' -; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; BTVER2-LABEL: 'mul_2i32' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %A0 = mul <2 x i32> undef, undef Index: llvm/test/Analysis/CostModel/X86/cast.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/cast.ll +++ llvm/test/Analysis/CostModel/X86/cast.ll @@ -315,10 +315,10 @@ ; SSE-LABEL: 'sitofp4' ; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -359,7 +359,7 @@ define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { ; SSE-LABEL: 'sitofp8' ; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -390,9 +390,9 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated 
cost of 8 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -433,7 +433,7 @@ define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { ; SSE-LABEL: 'uitofp8' ; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void Index: llvm/test/Analysis/CostModel/X86/fptosi.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/fptosi.ll +++ llvm/test/Analysis/CostModel/X86/fptosi.ll @@ -92,35 +92,28 @@ define i32 @fptosi_double_i16(i32 %arg) { ; SSE-LABEL: 'fptosi_double_i16' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> -; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> -; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptosi_double_i16' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> ; AVX-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'fptosi_double_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512DQ-LABEL: 'fptosi_double_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'fptosi_double_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'fptosi_double_i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef @@ -143,29 +136,22 @@ ; AVX-LABEL: 'fptosi_double_i8' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 ; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 
'fptosi_double_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512DQ-LABEL: 'fptosi_double_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'fptosi_double_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'fptosi_double_i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I8 = fptosi double undef to i8 @@ -285,9 +271,9 @@ define i32 @fptosi_float_i8(i32 %arg) { ; SSE-LABEL: 'fptosi_float_i8' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> ; SSE-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptosi_float_i8' Index: llvm/test/Analysis/CostModel/X86/fptoui.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/fptoui.ll +++ llvm/test/Analysis/CostModel/X86/fptoui.ll @@ -68,19 +68,12 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'fptoui_double_i32' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512DQ-LABEL: 'fptoui_double_i32' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'fptoui_double_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'fptoui_double_i32' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 @@ -106,30 +99,23 @@ ; ; AVX-LABEL: 'fptoui_double_i16' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16 -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> -; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'fptoui_double_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %I16 = fptoui double undef to i16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512DQ-LABEL: 'fptoui_double_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16 -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'fptoui_double_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'fptoui_double_i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I16 = fptoui double undef to i16 @@ -154,19 +140,12 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'fptoui_double_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; 
-; AVX512DQ-LABEL: 'fptoui_double_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'fptoui_double_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'fptoui_double_i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 @@ -277,7 +256,7 @@ ; ; AVX-LABEL: 'fptoui_float_i16' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16 -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16> ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef @@ -291,7 +270,7 @@ ; ; BTVER2-LABEL: 'fptoui_float_i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef @@ -314,8 +293,8 @@ ; AVX-LABEL: 'fptoui_float_i8' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 ; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> ; AVX-NEXT: Cost Model: Found 
an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'fptoui_float_i8' @@ -328,8 +307,8 @@ ; BTVER2-LABEL: 'fptoui_float_i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I8 = fptoui float undef to i8 Index: llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -52,7 +52,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) @@ -79,7 +79,7 @@ ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated 
cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) @@ -106,15 +106,15 @@ ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) ; SKX-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: ret i32 0 ; %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef) @@ -194,7 +194,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef) @@ -221,7 +221,7 @@ ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef) @@ -248,15 +248,15 @@ ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) -; SKX-NEXT: Cost Model: Found 
an estimated cost of 3 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef) @@ -960,15 +960,10 @@ } define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { -; SSE2-LABEL: 'test5' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; SSE42-LABEL: 'test5' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE-LABEL: 'test5' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 
'test5' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -986,24 +981,19 @@ } define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { -; SSE2-LABEL: 'test6' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; SSE42-LABEL: 'test6' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE-LABEL: 'test6' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test6' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test6' ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1012,15 +1002,10 @@ } define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { -; SSE2-LABEL: 'test7' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res -; -; SSE42-LABEL: 'test7' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) -; SSE42-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret <2 x float> %res +; SSE-LABEL: 'test7' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res ; ; AVX-LABEL: 'test7' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1038,24 +1023,19 @@ } define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { -; SSE2-LABEL: 'test8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res -; -; SSE42-LABEL: 'test8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res +; SSE-LABEL: 'test8' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; ; AVX-LABEL: 'test8' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; ; AVX512-LABEL: 'test8' ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer Index: llvm/test/Analysis/CostModel/X86/reduce-add-widen.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/reduce-add-widen.ll +++ llvm/test/Analysis/CostModel/X86/reduce-add-widen.ll @@ -75,7 +75,7 @@ ; SSSE3-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) @@ -83,7 +83,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) @@ -91,7 +91,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) @@ -126,8 +126,8 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 
@llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) @@ -135,8 +135,8 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) @@ -144,8 +144,8 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) @@ -153,8 +153,8 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 
@llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
@@ -162,8 +162,8 @@
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
@@ -171,8 +171,8 @@
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
Index: llvm/test/Analysis/CostModel/X86/reduce-add.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/reduce-add.ll
+++ llvm/test/Analysis/CostModel/X86/reduce-add.ll
@@ -83,7 +83,7 @@
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>
undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) @@ -91,7 +91,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) @@ -108,8 +108,8 @@ define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) @@ -135,7 +135,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) @@ -144,7 +144,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 
@llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) @@ -153,7 +153,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) @@ -162,7 +162,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) @@ -171,7 +171,7 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) @@ -190,9 +190,9 @@ define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) @@ -210,9 +210,9 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) @@ -220,9 +220,9 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32 = call i8 
@llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) @@ -230,9 +230,9 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) @@ -240,9 +240,9 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) @@ -250,9 +250,9 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) @@ -260,9 +260,9 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) Index: llvm/test/Analysis/CostModel/X86/reduce-and.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/reduce-and.ll +++ llvm/test/Analysis/CostModel/X86/reduce-and.ll @@ -92,8 +92,8 @@ define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> 
undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) @@ -174,9 +174,9 @@ define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) Index: llvm/test/Analysis/CostModel/X86/reduce-mul.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/reduce-mul.ll +++ llvm/test/Analysis/CostModel/X86/reduce-mul.ll @@ -67,7 +67,7 @@ define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) @@ -75,7 +75,7 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 
@llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) @@ -83,7 +83,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) @@ -91,7 +91,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) @@ -99,36 +99,20 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'reduce_i32' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) 
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'reduce_i32' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512DQ-LABEL: 'reduce_i32' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'reduce_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) @@ -140,8 +124,8 @@ define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x 
i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) @@ -149,8 +133,8 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) @@ -158,8 +142,8 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) @@ -167,8 +151,8 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX1-NEXT: 
Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) @@ -176,8 +160,8 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) @@ -185,7 +169,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) @@ -194,7 +178,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) @@ -222,9 +206,9 @@ 
define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) @@ -232,9 +216,9 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) @@ -242,9 +226,9 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) @@ -252,9 +236,9 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) @@ -262,9 +246,9 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V64 = 
call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) @@ -272,9 +256,9 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) @@ -282,9 +266,9 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 115 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) @@ -292,9 +276,9 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 
@llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) Index: llvm/test/Analysis/CostModel/X86/reduce-or.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/reduce-or.ll +++ llvm/test/Analysis/CostModel/X86/reduce-or.ll @@ -92,8 +92,8 @@ define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) @@ -174,9 +174,9 @@ define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) ; SSE2-NEXT: Cost Model: Found an 
estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef)
Index: llvm/test/Analysis/CostModel/X86/reduce-smax.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/reduce-smax.ll
+++ llvm/test/Analysis/CostModel/X86/reduce-smax.ll
@@ -83,7 +83,7 @@
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i32'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
@@ -91,7 +91,7 @@
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
@@ -99,7 +99,7 @@
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
@@ -107,7 +107,7 @@
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
;
AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) @@ -124,8 +124,8 @@ define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) @@ -133,8 +133,8 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) @@ -142,7 +142,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) @@ -151,7 +151,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) @@ -160,7 +160,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) @@ -169,7 +169,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) @@ -178,7 +178,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) @@ -187,7 +187,7 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 
for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) @@ -206,8 +206,8 @@ define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) @@ -216,8 +216,8 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) @@ -226,9 +226,9 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) @@ -236,9 +236,9 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) @@ -246,9 +246,9 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 
@llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) @@ -256,9 +256,9 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) @@ -266,9 +266,9 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) @@ -276,9 +276,9 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 
@llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) Index: llvm/test/Analysis/CostModel/X86/reduce-smin.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/reduce-smin.ll +++ llvm/test/Analysis/CostModel/X86/reduce-smin.ll @@ -83,7 +83,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) @@ -91,7 +91,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) @@ -99,7 +99,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) @@ -107,7 +107,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) @@ -124,8 +124,8 @@ define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) @@ -133,8 +133,8 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 
@llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) @@ -142,7 +142,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) @@ -151,7 +151,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) @@ -160,7 +160,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) @@ -169,7 +169,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 
@llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) @@ -178,7 +178,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) @@ -187,7 +187,7 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) @@ -206,8 +206,8 @@ define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) @@ -216,8 +216,8 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) +; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) @@ -226,9 +226,9 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) @@ -236,9 +236,9 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) @@ -246,9 +246,9 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) @@ -256,9 +256,9 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) @@ -266,9 +266,9 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 
@llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) @@ -276,9 +276,9 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) Index: llvm/test/Analysis/CostModel/X86/reduce-umax.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/reduce-umax.ll +++ llvm/test/Analysis/CostModel/X86/reduce-umax.ll @@ -83,7 +83,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) 
@@ -91,7 +91,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) @@ -99,7 +99,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) @@ -107,7 +107,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) @@ -124,8 +124,8 @@ define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 
@llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) @@ -133,8 +133,8 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) @@ -142,7 +142,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) @@ -151,7 +151,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) @@ -160,7 +160,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 
@llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) @@ -169,7 +169,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) @@ -178,7 +178,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) @@ -187,7 +187,7 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) @@ -206,9 +206,9 @@ define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 
@llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) @@ -216,9 +216,9 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) @@ -226,9 +226,9 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) ; 
SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) @@ -236,9 +236,9 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) @@ -246,9 +246,9 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) @@ -256,9 +256,9 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call 
i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) @@ -266,9 +266,9 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) @@ -276,9 +276,9 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) Index: llvm/test/Analysis/CostModel/X86/reduce-umin.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/reduce-umin.ll +++ llvm/test/Analysis/CostModel/X86/reduce-umin.ll @@ -83,7 +83,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) @@ -91,7 +91,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) @@ -99,7 +99,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) @@ -107,7 +107,7 @@ ; 
AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) @@ -124,8 +124,8 @@ define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) @@ -133,8 +133,8 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) @@ -142,7 +142,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 
@llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) @@ -151,7 +151,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) @@ -160,7 +160,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) @@ -169,7 +169,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) @@ -178,7 +178,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 
@llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) @@ -187,7 +187,7 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) @@ -206,9 +206,9 @@ define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) @@ -216,9 +216,9 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) @@ -226,9 +226,9 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) @@ -236,9 +236,9 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call 
i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) @@ -246,9 +246,9 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) @@ -256,9 +256,9 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) @@ -266,9 +266,9 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 
@llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) @@ -276,9 +276,9 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) Index: llvm/test/Analysis/CostModel/X86/reduce-xor.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/reduce-xor.ll +++ llvm/test/Analysis/CostModel/X86/reduce-xor.ll @@ -92,8 +92,8 @@ define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for 
instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) @@ -174,9 +174,9 @@ define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) Index: llvm/test/Analysis/CostModel/X86/shuffle-transpose.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/shuffle-transpose.ll +++ llvm/test/Analysis/CostModel/X86/shuffle-transpose.ll @@ -123,21 +123,21 @@ define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i32> %b128, <8 x i32> %a256, <8 x i32> %b256, <16 x i32> %a512, <16 x i32> %b512) { ; SSE-LABEL: 'test_vXi32' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x 
i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> @@ -151,7 +151,7 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; BTVER2-LABEL: 'test_vXi32' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> Index: llvm/test/Analysis/CostModel/X86/sitofp.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/sitofp.ll +++ llvm/test/Analysis/CostModel/X86/sitofp.ll @@ -13,9 +13,9 @@ define i32 @sitofp_i8_double() { ; SSE-LABEL: 'sitofp_i8_double' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'sitofp_i8_double' @@ -49,8 +49,8 @@ define i32 @sitofp_i16_double() { ; SSE-LABEL: 'sitofp_i16_double' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated 
cost of 80 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; @@ -85,7 +85,7 @@ define i32 @sitofp_i32_double() { ; SSE-LABEL: 'sitofp_i32_double' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef @@ -164,8 +164,8 @@ define i32 @sitofp_i8_float() { ; SSE-LABEL: 'sitofp_i8_float' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; @@ -200,7 +200,7 @@ define i32 @sitofp_i16_float() { ; SSE-LABEL: 'sitofp_i16_float' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef Index: llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll +++ llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll @@ -47,11 +47,11 @@ define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) { ; SLM-LABEL: 'slm-costs_8_v2_mul' -; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = mul nsw <2 x i8> %a, %b +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = mul nsw <2 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %res ; ; GLM-LABEL: 'slm-costs_8_v2_mul' -; GLM-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %res = mul nsw <2 x i8> %a, %b +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = mul nsw <2 x i8> %a, %b ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %res ; entry: @@ -61,11 +61,11 @@ define <4 x i8> @slm-costs_8_v4_mul(<4 x i8> %a, <4 x i8> %b) { ; SLM-LABEL: 'slm-costs_8_v4_mul' -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = mul nsw <4 x i8> %a, %b +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = mul nsw <4 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %res ; ; GLM-LABEL: 'slm-costs_8_v4_mul' -; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <4 x i8> %a, %b +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = mul nsw <4 x i8> %a, %b ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %res ; entry: @@ -177,11 +177,11 @@ define <8 x i8> @slm-costs_8_v8_mul(<8 x i8> %a, <8 x i8> %b) { ; SLM-LABEL: 'slm-costs_8_v8_mul' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <8 x i8> %a, %b +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = mul nsw <8 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res ; ; GLM-LABEL: 'slm-costs_8_v8_mul' -; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = mul nsw <8 x i8> %a, %b +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = mul nsw <8 x i8> %a, %b ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res ; entry: @@ -216,11 +216,11 @@ define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) { ; SLM-LABEL: 'slm-costs_16_v2_mul' -; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = mul nsw <2 x i16> %a, %b +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <2 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %res ; ; GLM-LABEL: 'slm-costs_16_v2_mul' -; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = mul nsw <2 x i16> %a, %b +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = mul nsw <2 x i16> %a, %b ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %res ; entry: @@ -230,11 +230,11 @@ define <4 x i16> @slm-costs_16_v4_mul(<4 x i16> %a, <4 x i16> %b) { ; SLM-LABEL: 'slm-costs_16_v4_mul' -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = mul nsw <4 x i16> %a, %b +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <4 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %res ; ; GLM-LABEL: 'slm-costs_16_v4_mul' -; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <4 x i16> %a, %b +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = mul nsw <4 x i16> %a, %b ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %res ; entry: @@ -385,11 +385,11 @@ define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) { ; SLM-LABEL: 'slm-costs_32_v2_mul' -; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = mul nsw <2 x i32> %a, %b +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for 
instruction: %res = mul nsw <2 x i32> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; ; GLM-LABEL: 'slm-costs_32_v2_mul' -; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = mul nsw <2 x i32> %a, %b +; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <2 x i32> %a, %b ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; entry: Index: llvm/test/Analysis/CostModel/X86/testshiftashr.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/testshiftashr.ll +++ llvm/test/Analysis/CostModel/X86/testshiftashr.ll @@ -5,9 +5,9 @@ define %shifttype @shift2i16(%shifttype %a, %shifttype %b) { entry: ; SSE2-LABEL: shift2i16 - ; SSE2: cost of 12 {{.*}} ashr + ; SSE2: cost of 32 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift2i16 - ; SSE2-CODEGEN: psrlq + ; SSE2-CODEGEN: psraw %0 = ashr %shifttype %a , %b ret %shifttype %0 @@ -17,9 +17,9 @@ define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) { entry: ; SSE2-LABEL: shift4i16 - ; SSE2: cost of 16 {{.*}} ashr + ; SSE2: cost of 32 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift4i16 - ; SSE2-CODEGEN: psrad + ; SSE2-CODEGEN: psraw %0 = ashr %shifttype4i16 %a , %b ret %shifttype4i16 %0 @@ -65,9 +65,9 @@ define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { entry: ; SSE2-LABEL: shift2i32 - ; SSE2: cost of 12 {{.*}} ashr + ; SSE2: cost of 16 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift2i32 - ; SSE2-CODEGEN: psrlq + ; SSE2-CODEGEN: psrad %0 = ashr %shifttype2i32 %a , %b ret %shifttype2i32 %0 @@ -185,9 +185,9 @@ define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) { entry: ; SSE2-LABEL: shift2i8 - ; SSE2: cost of 12 {{.*}} ashr + ; SSE2: cost of 54 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift2i8 - ; SSE2-CODEGEN: psrlq + ; SSE2-CODEGEN: psrlw %0 = ashr %shifttype2i8 %a , %b ret %shifttype2i8 %0 @@ -197,9 +197,9 @@ define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) { entry: ; SSE2-LABEL: shift4i8 - ; SSE2: cost of 16 {{.*}} ashr + ; SSE2: cost of 54 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift4i8 - ; SSE2-CODEGEN: psrad + ; SSE2-CODEGEN: psraw %0 = ashr %shifttype4i8 %a , %b ret %shifttype4i8 %0 @@ -209,7 +209,7 @@ define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) { entry: ; SSE2-LABEL: shift8i8 - ; SSE2: cost of 32 {{.*}} ashr + ; SSE2: cost of 54 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift8i8 ; SSE2-CODEGEN: psraw @@ -247,9 +247,9 @@ define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) { entry: ; SSE2-LABEL: shift2i16const - ; SSE2: cost of 4 {{.*}} ashr + ; SSE2: cost of 1 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift2i16const - ; SSE2-CODEGEN: psrad $3 + ; SSE2-CODEGEN: psraw $3 %0 = ashr %shifttypec %a , ret %shifttypec %0 @@ -261,7 +261,7 @@ ; SSE2-LABEL: shift4i16const ; SSE2: cost of 1 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift4i16const - ; SSE2-CODEGEN: psrad $19 + ; SSE2-CODEGEN: psraw $3 %0 = ashr %shifttypec4i16 %a , ret %shifttypec4i16 %0 @@ -320,7 +320,7 @@ define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) { entry: ; SSE2-LABEL: shift2i32c - ; SSE2: cost of 4 {{.*}} ashr + ; SSE2: cost of 1 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift2i32c ; SSE2-CODEGEN: psrad $3 @@ -464,7 +464,7 @@ ; SSE2-LABEL: shift2i8c ; SSE2: cost of 4 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift2i8c - ; SSE2-CODEGEN: psrad $3 + ; SSE2-CODEGEN: psrlw $3 %0 = ashr %shifttypec2i8 %a , ret %shifttypec2i8 %0 @@ -474,9 
+474,9 @@ define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) { entry: ; SSE2-LABEL: shift4i8c - ; SSE2: cost of 1 {{.*}} ashr + ; SSE2: cost of 4 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift4i8c - ; SSE2-CODEGEN: psrad $27 + ; SSE2-CODEGEN: psrlw $3 %0 = ashr %shifttypec4i8 %a , ret %shifttypec4i8 %0 @@ -486,9 +486,9 @@ define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) { entry: ; SSE2-LABEL: shift8i8c - ; SSE2: cost of 1 {{.*}} ashr + ; SSE2: cost of 4 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift8i8c - ; SSE2-CODEGEN: psraw $11 + ; SSE2-CODEGEN: psrlw $3 %0 = ashr %shifttypec8i8 %a , Index: llvm/test/Analysis/CostModel/X86/testshiftlshr.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/testshiftlshr.ll +++ llvm/test/Analysis/CostModel/X86/testshiftlshr.ll @@ -5,9 +5,9 @@ define %shifttype @shift2i16(%shifttype %a, %shifttype %b) { entry: ; SSE2-LABEL: shift2i16 - ; SSE2: cost of 4 {{.*}} lshr + ; SSE2: cost of 32 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift2i16 - ; SSE2-CODEGEN: psrlq + ; SSE2-CODEGEN: psrlw %0 = lshr %shifttype %a , %b ret %shifttype %0 @@ -17,9 +17,9 @@ define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) { entry: ; SSE2-LABEL: shift4i16 - ; SSE2: cost of 16 {{.*}} lshr + ; SSE2: cost of 32 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift4i16 - ; SSE2-CODEGEN: psrld + ; SSE2-CODEGEN: psrlw %0 = lshr %shifttype4i16 %a , %b ret %shifttype4i16 %0 @@ -65,9 +65,9 @@ define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { entry: ; SSE2-LABEL: shift2i32 - ; SSE2: cost of 4 {{.*}} lshr + ; SSE2: cost of 16 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift2i32 - ; SSE2-CODEGEN: psrlq + ; SSE2-CODEGEN: psrld %0 = lshr %shifttype2i32 %a , %b ret %shifttype2i32 %0 @@ -185,9 +185,9 @@ define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) { entry: ; SSE2-LABEL: shift2i8 - ; SSE2: cost of 4 {{.*}} lshr + ; SSE2: cost of 26 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift2i8 - ; SSE2-CODEGEN: psrlq + ; SSE2-CODEGEN: psrlw %0 = lshr %shifttype2i8 %a , %b ret %shifttype2i8 %0 @@ -197,9 +197,9 @@ define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) { entry: ; SSE2-LABEL: shift4i8 - ; SSE2: cost of 16 {{.*}} lshr + ; SSE2: cost of 26 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift4i8 - ; SSE2-CODEGEN: psrld + ; SSE2-CODEGEN: psrlw %0 = lshr %shifttype4i8 %a , %b ret %shifttype4i8 %0 @@ -209,7 +209,7 @@ define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) { entry: ; SSE2-LABEL: shift8i8 - ; SSE2: cost of 32 {{.*}} lshr + ; SSE2: cost of 26 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift8i8 ; SSE2-CODEGEN: psrlw @@ -249,7 +249,7 @@ ; SSE2-LABEL: shift2i16const ; SSE2: cost of 1 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift2i16const - ; SSE2-CODEGEN: psrlq $3 + ; SSE2-CODEGEN: psrlw $3 %0 = lshr %shifttypec %a , ret %shifttypec %0 @@ -261,7 +261,7 @@ ; SSE2-LABEL: shift4i16const ; SSE2: cost of 1 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift4i16const - ; SSE2-CODEGEN: psrld $3 + ; SSE2-CODEGEN: psrlw $3 %0 = lshr %shifttypec4i16 %a , ret %shifttypec4i16 %0 @@ -322,7 +322,7 @@ ; SSE2-LABEL: shift2i32c ; SSE2: cost of 1 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift2i32c - ; SSE2-CODEGEN: psrlq $3 + ; SSE2-CODEGEN: psrld $3 %0 = lshr %shifttypec2i32 %a , ret %shifttypec2i32 %0 @@ -461,9 +461,9 @@ define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) { entry: ; SSE2-LABEL: shift2i8c - ; SSE2: cost of 1 {{.*}} lshr + ; SSE2: cost of 2 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift2i8c - ; 
SSE2-CODEGEN: psrlq $3 + ; SSE2-CODEGEN: psrlw $3 %0 = lshr %shifttypec2i8 %a , ret %shifttypec2i8 %0 @@ -473,9 +473,9 @@ define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) { entry: ; SSE2-LABEL: shift4i8c - ; SSE2: cost of 1 {{.*}} lshr + ; SSE2: cost of 2 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift4i8c - ; SSE2-CODEGEN: psrld $3 + ; SSE2-CODEGEN: psrlw $3 %0 = lshr %shifttypec4i8 %a , ret %shifttypec4i8 %0 @@ -485,7 +485,7 @@ define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) { entry: ; SSE2-LABEL: shift8i8c - ; SSE2: cost of 1 {{.*}} lshr + ; SSE2: cost of 2 {{.*}} lshr ; SSE2-CODEGEN-LABEL: shift8i8c ; SSE2-CODEGEN: psrlw $3 Index: llvm/test/Analysis/CostModel/X86/testshiftshl.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/testshiftshl.ll +++ llvm/test/Analysis/CostModel/X86/testshiftshl.ll @@ -5,9 +5,9 @@ define %shifttype @shift2i16(%shifttype %a, %shifttype %b) { entry: ; SSE2-LABEL: shift2i16 - ; SSE2: cost of 4 {{.*}} shl + ; SSE2: cost of 32 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift2i16 - ; SSE2-CODEGEN: psllq + ; SSE2-CODEGEN: pmullw %0 = shl %shifttype %a , %b ret %shifttype %0 @@ -17,9 +17,9 @@ define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) { entry: ; SSE2-LABEL: shift4i16 - ; SSE2: cost of 10 {{.*}} shl + ; SSE2: cost of 32 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift4i16 - ; SSE2-CODEGEN: pmuludq + ; SSE2-CODEGEN: pmullw %0 = shl %shifttype4i16 %a , %b ret %shifttype4i16 %0 @@ -65,9 +65,9 @@ define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { entry: ; SSE2-LABEL: shift2i32 - ; SSE2: cost of 4 {{.*}} shl + ; SSE2: cost of 10 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift2i32 - ; SSE2-CODEGEN: psllq + ; SSE2-CODEGEN: pmuludq %0 = shl %shifttype2i32 %a , %b ret %shifttype2i32 %0 @@ -185,9 +185,9 @@ define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) { entry: ; SSE2-LABEL: shift2i8 - ; SSE2: cost of 4 {{.*}} shl + ; SSE2: cost of 26 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift2i8 - ; SSE2-CODEGEN: psllq + ; SSE2-CODEGEN: psllw %0 = shl %shifttype2i8 %a , %b ret %shifttype2i8 %0 @@ -197,9 +197,9 @@ define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) { entry: ; SSE2-LABEL: shift4i8 - ; SSE2: cost of 10 {{.*}} shl + ; SSE2: cost of 26 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift4i8 - ; SSE2-CODEGEN: pmuludq + ; SSE2-CODEGEN: psllw %0 = shl %shifttype4i8 %a , %b ret %shifttype4i8 %0 @@ -209,9 +209,9 @@ define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) { entry: ; SSE2-LABEL: shift8i8 - ; SSE2: cost of 32 {{.*}} shl + ; SSE2: cost of 26 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift8i8 - ; SSE2-CODEGEN: pmullw + ; SSE2-CODEGEN: psllw %0 = shl %shifttype8i8 %a , %b ret %shifttype8i8 %0 @@ -249,7 +249,7 @@ ; SSE2-LABEL: shift2i16const ; SSE2: cost of 1 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift2i16const - ; SSE2-CODEGEN: psllq $3 + ; SSE2-CODEGEN: psllw $3 %0 = shl %shifttypec %a , ret %shifttypec %0 @@ -261,7 +261,7 @@ ; SSE2-LABEL: shift4i16const ; SSE2: cost of 1 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift4i16const - ; SSE2-CODEGEN: pslld $3 + ; SSE2-CODEGEN: psllw $3 %0 = shl %shifttypec4i16 %a , ret %shifttypec4i16 %0 @@ -322,7 +322,7 @@ ; SSE2-LABEL: shift2i32c ; SSE2: cost of 1 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift2i32c - ; SSE2-CODEGEN: psllq $3 + ; SSE2-CODEGEN: pslld $3 %0 = shl %shifttypec2i32 %a , ret %shifttypec2i32 %0 @@ -461,9 +461,9 @@ define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) { entry: ; SSE2-LABEL: 
shift2i8c - ; SSE2: cost of 1 {{.*}} shl + ; SSE2: cost of 2 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift2i8c - ; SSE2-CODEGEN: psllq $3 + ; SSE2-CODEGEN: psllw $3 %0 = shl %shifttypec2i8 %a , ret %shifttypec2i8 %0 @@ -473,9 +473,9 @@ define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) { entry: ; SSE2-LABEL: shift4i8c - ; SSE2: cost of 1 {{.*}} shl + ; SSE2: cost of 2 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift4i8c - ; SSE2-CODEGEN: pslld $3 + ; SSE2-CODEGEN: psllw $3 %0 = shl %shifttypec4i8 %a , ret %shifttypec4i8 %0 @@ -485,7 +485,7 @@ define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) { entry: ; SSE2-LABEL: shift8i8c - ; SSE2: cost of 1 {{.*}} shl + ; SSE2: cost of 2 {{.*}} shl ; SSE2-CODEGEN-LABEL: shift8i8c ; SSE2-CODEGEN: psllw $3 Index: llvm/test/Analysis/CostModel/X86/uitofp.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/uitofp.ll +++ llvm/test/Analysis/CostModel/X86/uitofp.ll @@ -13,9 +13,9 @@ define i32 @uitofp_i8_double() { ; SSE-LABEL: 'uitofp_i8_double' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'uitofp_i8_double' @@ -49,8 +49,8 @@ define i32 @uitofp_i16_double() { ; SSE-LABEL: 'uitofp_i16_double' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; @@ -85,7 +85,7 @@ define i32 @uitofp_i32_double() { ; SSE-LABEL: 'uitofp_i32_double' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = uitofp <4 
x i32> undef to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef @@ -165,7 +165,7 @@ ; SSE-LABEL: 'uitofp_i8_float' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float ; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; @@ -200,7 +200,7 @@ define i32 @uitofp_i16_float() { ; SSE-LABEL: 'uitofp_i16_float' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef Index: llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll =================================================================== --- llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll +++ llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll @@ -7,7 +7,6 @@ define <2 x double> @a(<2 x i32> %x) nounwind { ; CHECK-LABEL: a: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0 ; CHECK-NEXT: retl entry: @@ -19,7 +18,6 @@ ; CHECK-LABEL: b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttpd2dq %xmm0, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: retl entry: %y = fptosi <2 x double> %x to <2 x i32> Index: llvm/test/CodeGen/X86/2009-06-05-VZextByteShort.ll =================================================================== --- llvm/test/CodeGen/X86/2009-06-05-VZextByteShort.ll +++ llvm/test/CodeGen/X86/2009-06-05-VZextByteShort.ll @@ -7,6 +7,7 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl (%eax), %eax ; CHECK-NEXT: shrl %eax +; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: retl @@ -40,7 +41,7 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl (%eax), %eax ; CHECK-NEXT: shrl %eax -; CHECK-NEXT: movzwl %ax, %eax +; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: retl Index: llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll =================================================================== --- llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll +++ llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll @@ -17,19 +17,23 @@ define i32 @main() nounwind uwtable { ; CHECK-LABEL: main: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pmovsxbq {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pmovsxbq {{.*}}(%rip), 
%xmm1 -; CHECK-NEXT: pextrq $1, %xmm1, %rax -; CHECK-NEXT: pextrq $1, %xmm0, %rcx -; CHECK-NEXT: cqto -; CHECK-NEXT: idivq %rcx -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: movq %xmm1, %rax -; CHECK-NEXT: movq %xmm0, %rcx -; CHECK-NEXT: cqto -; CHECK-NEXT: idivq %rcx -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: pextrb $1, %xmm0, %eax +; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: pextrb $1, %xmm1, %ecx +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: cbtw +; CHECK-NEXT: idivb %cl +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: pextrb $0, %xmm0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: cbtw +; CHECK-NEXT: pextrb $0, %xmm1, %edx +; CHECK-NEXT: idivb %dl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: pinsrb $1, %ecx, %xmm0 ; CHECK-NEXT: pextrw $0, %xmm0, {{.*}}(%rip) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/2011-12-28-vselecti8.ll =================================================================== --- llvm/test/CodeGen/X86/2011-12-28-vselecti8.ll +++ llvm/test/CodeGen/X86/2011-12-28-vselecti8.ll @@ -18,10 +18,11 @@ define void @foo8(float* nocapture %RET) nounwind { ; CHECK-LABEL: foo8: ; CHECK: ## %bb.0: ## %allocas -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+2,2.0E+0,1.0E+2,4.0E+0] -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.0E+2,6.0E+0,1.0E+2,8.0E+0] -; CHECK-NEXT: movups %xmm1, 16(%rdi) -; CHECK-NEXT: movups %xmm0, (%rdi) +; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.0E+2,2.0E+0,1.0E+2,4.0E+0] +; CHECK-NEXT: movups %xmm1, (%rdi) +; CHECK-NEXT: movups %xmm0, 16(%rdi) ; CHECK-NEXT: retq allocas: %resultvec.i = select <8 x i1> , <8 x i8> , <8 x i8> Index: llvm/test/CodeGen/X86/2011-12-8-bitcastintprom.ll =================================================================== --- llvm/test/CodeGen/X86/2011-12-8-bitcastintprom.ll +++ llvm/test/CodeGen/X86/2011-12-8-bitcastintprom.ll @@ -6,16 +6,12 @@ define void @prom_bug(<4 x i8> %t, i16* %p) { ; SSE2-LABEL: prom_bug: ; SSE2: ## %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: prom_bug: ; SSE41: ## %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pextrw $0, %xmm0, (%rdi) ; SSE41-NEXT: retq %r = bitcast <4 x i8> %t to <2 x i16> Index: llvm/test/CodeGen/X86/2012-01-18-vbitcast.ll =================================================================== --- llvm/test/CodeGen/X86/2012-01-18-vbitcast.ll +++ llvm/test/CodeGen/X86/2012-01-18-vbitcast.ll @@ -4,9 +4,8 @@ define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: vcast: ; CHECK: # %bb.0: -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: psubq %xmm1, %xmm0 +; CHECK-NEXT: movdqa (%rcx), %xmm0 +; CHECK-NEXT: psubd (%rdx), %xmm0 ; CHECK-NEXT: retq %af = bitcast <2 x float> %a to <2 x i32> %bf = bitcast <2 x 
float> %b to <2 x i32> Index: llvm/test/CodeGen/X86/2012-03-15-build_vector_wl.ll =================================================================== --- llvm/test/CodeGen/X86/2012-03-15-build_vector_wl.ll +++ llvm/test/CodeGen/X86/2012-03-15-build_vector_wl.ll @@ -4,7 +4,6 @@ define <4 x i8> @build_vector_again(<16 x i8> %in) nounwind readnone { ; CHECK-LABEL: build_vector_again: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: retq entry: %out = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> Index: llvm/test/CodeGen/X86/2012-07-10-extload64.ll =================================================================== --- llvm/test/CodeGen/X86/2012-07-10-extload64.ll +++ llvm/test/CodeGen/X86/2012-07-10-extload64.ll @@ -33,7 +33,7 @@ ; CHECK-LABEL: load_64: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retl BB: %t = load <2 x i32>, <2 x i32>* %ptr Index: llvm/test/CodeGen/X86/3dnow-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/3dnow-intrinsics.ll +++ llvm/test/CodeGen/X86/3dnow-intrinsics.ll @@ -14,8 +14,7 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: pavgusb %mm1, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast x86_mmx %a.coerce to <8 x i8> @@ -52,8 +51,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm0 ; X64-NEXT: pf2id %mm0, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -169,8 +167,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm1 ; X64-NEXT: pfcmpeq %mm0, %mm1 ; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -209,8 +206,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm1 ; X64-NEXT: pfcmpge %mm0, %mm1 ; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -249,8 +245,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm1 ; X64-NEXT: pfcmpgt %mm0, %mm1 ; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -685,8 +680,7 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: pmulhrw %mm1, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast x86_mmx %a.coerce to <4 x i16> @@ -723,8 +717,7 @@ ; X64-NEXT: movdq2q %xmm0, %mm0 ; X64-NEXT: pf2iw %mm0, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x float> %a to x86_mmx @@ -896,12 +889,10 @@ ; ; X64-LABEL: test_pswapdsi: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: movdq2q %xmm0, %mm0 ; X64-NEXT: pswapd %mm0, %mm0 # mm0 = mm0[1,0] ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: %0 = bitcast <2 x i32> %a to x86_mmx Index: llvm/test/CodeGen/X86/4char-promote.ll =================================================================== --- llvm/test/CodeGen/X86/4char-promote.ll +++ llvm/test/CodeGen/X86/4char-promote.ll @@ -7,8 +7,11 @@ define <4 x i8> @foo(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: pmulld %xmm0, %xmm1 -; CHECK-NEXT: paddd %xmm1, %xmm0 +; CHECK-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; CHECK-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: pmullw %xmm1, %xmm2 +; CHECK-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: retq entry: %binop = mul <4 x i8> %x, %y Index: llvm/test/CodeGen/X86/and-load-fold.ll =================================================================== --- llvm/test/CodeGen/X86/and-load-fold.ll +++ llvm/test/CodeGen/X86/and-load-fold.ll @@ -6,10 +6,8 @@ define i8 @foo(<4 x i8>* %V) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: pextrw $1, %xmm0, %eax +; CHECK-NEXT: movb 2(%rdi), %al ; CHECK-NEXT: andb $95, %al -; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %Vp = bitcast <4 x i8>* %V to <3 x i8>* %V3i8 = load <3 x i8>, <3 x i8>* %Vp, align 4 Index: llvm/test/CodeGen/X86/atomic-unordered.ll =================================================================== --- llvm/test/CodeGen/X86/atomic-unordered.ll +++ llvm/test/CodeGen/X86/atomic-unordered.ll @@ -460,7 +460,7 @@ ; CHECK-O0-LABEL: vec_store: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: vmovd %xmm0, %eax -; CHECK-O0-NEXT: vpextrd $2, %xmm0, %ecx +; CHECK-O0-NEXT: vpextrd $1, %xmm0, %ecx ; CHECK-O0-NEXT: movl %eax, (%rdi) ; CHECK-O0-NEXT: movl %ecx, 4(%rdi) ; CHECK-O0-NEXT: retq @@ -468,7 +468,7 @@ ; CHECK-O3-LABEL: vec_store: ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: vmovd %xmm0, %eax -; CHECK-O3-NEXT: vpextrd $2, %xmm0, %ecx +; CHECK-O3-NEXT: vpextrd $1, %xmm0, %ecx ; CHECK-O3-NEXT: movl %eax, (%rdi) ; CHECK-O3-NEXT: movl %ecx, 4(%rdi) ; CHECK-O3-NEXT: retq @@ -485,7 +485,7 @@ ; CHECK-O0-LABEL: vec_store_unaligned: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: vmovd %xmm0, %eax -; CHECK-O0-NEXT: vpextrd $2, %xmm0, %ecx +; CHECK-O0-NEXT: vpextrd $1, %xmm0, %ecx ; CHECK-O0-NEXT: movl %eax, (%rdi) ; CHECK-O0-NEXT: movl %ecx, 4(%rdi) ; CHECK-O0-NEXT: retq @@ -493,7 +493,7 @@ ; CHECK-O3-LABEL: vec_store_unaligned: ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: vmovd %xmm0, %eax -; CHECK-O3-NEXT: vpextrd $2, %xmm0, %ecx +; CHECK-O3-NEXT: vpextrd $1, %xmm0, %ecx ; CHECK-O3-NEXT: movl %eax, (%rdi) ; CHECK-O3-NEXT: movl %ecx, 4(%rdi) ; CHECK-O3-NEXT: retq Index: llvm/test/CodeGen/X86/avg.ll =================================================================== --- llvm/test/CodeGen/X86/avg.ll +++ 
llvm/test/CodeGen/X86/avg.ll @@ -378,63 +378,65 @@ ; AVX2-LABEL: avg_v48i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX2-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero -; 
AVX2-NEXT: vpaddd %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm5, %ymm9, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2 ; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 +; AVX2-NEXT: vperm2i128 {{.*#+}} 
ymm6 = ymm0[2,3],ymm3[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm2[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vpackusdw %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm4[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vpackusdw %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpackuswb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 -; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpackuswb %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm5[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqu %xmm1, (%rax) @@ -1897,118 +1899,178 @@ ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps (%rdi), %xmm0 -; SSE2-NEXT: movaps (%rsi), %xmm1 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps (%rdi), %xmm1 +; SSE2-NEXT: movaps (%rsi), %xmm0 +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: addq %r11, %rbp +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE2-NEXT: addq %r10, %r14 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: addq %r9, %rbx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d +; SSE2-NEXT: 
addq %r8, %r11 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: addq %rdx, %r10 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE2-NEXT: addq %rcx, %r8 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: addq %rax, %rdi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: addq %rsi, %rdx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: leal -1(%rdx,%rsi), %edx -; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leal -1(%rbx,%rdx), %edx -; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leal -1(%rbp,%rdx), %edx -; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leal -1(%rdi,%rdx), %r8d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leal -1(%rax,%rdx), %edi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: leal -1(%rcx,%rax), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: leal -1(%r9,%rax), %ecx +; SSE2-NEXT: leaq -1(%r15,%rsi), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: leal -1(%r10,%rsi), %eax +; SSE2-NEXT: leaq -1(%r12,%rsi), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: leaq -1(%r11,%rsi), %rsi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: leaq -1(%r12,%rbx), %r12 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: leaq -1(%r15,%rbx), %r15 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: leaq -1(%r14,%rbx), %r14 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rbx), %r11 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rbx), %r10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: leaq -1(%r13,%rbx), %r9 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE2-NEXT: leaq -1(%r13,%rbx), %rbx -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: shrl %ecx -; SSE2-NEXT: movd %ecx, %xmm15 -; SSE2-NEXT: shrl %edx -; SSE2-NEXT: movd %edx, %xmm9 -; SSE2-NEXT: shrl %edi -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: shrl %r8d -; SSE2-NEXT: movd %r8d, %xmm10 -; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: movd %eax, %xmm11 -; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: shrq %rsi -; SSE2-NEXT: movd %esi, %xmm12 -; SSE2-NEXT: shrq %r12 -; SSE2-NEXT: movd %r12d, %xmm3 -; SSE2-NEXT: shrq %r15 -; SSE2-NEXT: movd %r15d, %xmm13 -; SSE2-NEXT: shrq %r14 -; SSE2-NEXT: movd %r14d, %xmm7 -; SSE2-NEXT: shrq %r11 -; SSE2-NEXT: movd %r11d, %xmm14 -; SSE2-NEXT: shrq %r10 -; SSE2-NEXT: movd %r10d, %xmm5 -; SSE2-NEXT: shrq %r9 -; SSE2-NEXT: movd %r9d, %xmm0 -; SSE2-NEXT: shrq %rbx -; SSE2-NEXT: movd %ebx, %xmm1 +; SSE2-NEXT: leaq -1(%r13,%rsi), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: 
movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: leaq -1(%rax,%rsi), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: leaq -1(%rax,%rsi), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: leaq -1(%rax,%rsi), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi +; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi +; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: addq $-1, %rbp +; SSE2-NEXT: movl $0, %r9d +; SSE2-NEXT: adcq $-1, %r9 +; SSE2-NEXT: addq $-1, %r14 +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: adcq $-1, %rsi +; SSE2-NEXT: addq $-1, %rbx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: addq $-1, %r11 +; SSE2-NEXT: movl $0, %r12d +; SSE2-NEXT: adcq $-1, %r12 +; SSE2-NEXT: addq $-1, %r10 +; SSE2-NEXT: movl $0, %r13d +; SSE2-NEXT: adcq $-1, %r13 +; SSE2-NEXT: addq $-1, %r8 +; SSE2-NEXT: movl $0, %r15d +; SSE2-NEXT: adcq $-1, %r15 +; SSE2-NEXT: addq $-1, %rdi +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: adcq $-1, %rcx +; SSE2-NEXT: addq $-1, %rdx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: shldq $63, %rdx, %rax +; SSE2-NEXT: shldq $63, %rdi, %rcx +; SSE2-NEXT: movq %rcx, %rdx +; SSE2-NEXT: shldq $63, %r8, %r15 +; SSE2-NEXT: shldq $63, %r10, %r13 +; SSE2-NEXT: shldq $63, %r11, %r12 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; SSE2-NEXT: shldq $63, %rbx, %rdi +; SSE2-NEXT: shldq $63, %r14, %rsi +; SSE2-NEXT: shldq $63, %rbp, %r9 +; SSE2-NEXT: movq %r9, %xmm8 +; SSE2-NEXT: movq %rsi, %xmm15 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: movq %rcx, %xmm9 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: movq %rcx, %xmm10 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: movq %rcx, %xmm4 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: movq %rcx, %xmm11 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: movq %rcx, %xmm7 +; SSE2-NEXT: movq %rdi, %xmm12 +; SSE2-NEXT: movq %r12, %xmm0 +; SSE2-NEXT: movq %r13, %xmm13 +; SSE2-NEXT: movq %r15, %xmm6 +; SSE2-NEXT: movq %rdx, %xmm14 +; SSE2-NEXT: movq %rax, %xmm5 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movq %rax, %xmm3 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movq %rax, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm15[0,1,2,0] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm8 +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,2] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pslld $16, %xmm6 +; SSE2-NEXT: pandn %xmm6, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE2-NEXT: movdqu %xmm4, (%rax) +; SSE2-NEXT: psllq $48, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[65535,65535,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE2-NEXT: movupd %xmm4, (%rax) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -2025,118 +2087,181 @@ ; AVX1-NEXT: pushq %r13 ; AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-NEXT: vpextrq $1, %xmm7, %r15 -; AVX1-NEXT: vmovq %xmm7, %r14 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX1-NEXT: vpextrq $1, %xmm4, %r11 -; AVX1-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vmovd %xmm6, %ecx -; AVX1-NEXT: vpextrd $1, %xmm6, %edx -; AVX1-NEXT: vpextrd $2, %xmm6, %r13d -; AVX1-NEXT: vpextrd $3, %xmm6, %r12d -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vmovd %xmm1, %ebx -; AVX1-NEXT: vpextrd $1, %xmm1, %ebp -; AVX1-NEXT: vpextrd $2, %xmm1, %esi -; AVX1-NEXT: vpextrd $3, %xmm1, %edi -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX1-NEXT: vmovd %xmm7, %r8d -; AVX1-NEXT: leal -1(%r12,%rdi), %eax -; AVX1-NEXT: movl 
%eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrd $2, %xmm7, %eax -; AVX1-NEXT: leal -1(%r13,%rsi), %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrd $2, %xmm4, %edi -; AVX1-NEXT: leal -1(%rdx,%rbp), %edx -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrd $3, %xmm4, %edx -; AVX1-NEXT: leal -1(%rcx,%rbx), %r10d -; AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; AVX1-NEXT: leal -1(%rdx,%rcx), %r9d -; AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; AVX1-NEXT: leal -1(%rdi,%rcx), %edi -; AVX1-NEXT: vpextrd $2, %xmm5, %ecx -; AVX1-NEXT: leal -1(%rax,%rcx), %eax -; AVX1-NEXT: vmovd %xmm5, %ecx -; AVX1-NEXT: leal -1(%r8,%rcx), %r8d -; AVX1-NEXT: vpextrq $1, %xmm6, %rdx -; AVX1-NEXT: leal -1(%r15,%rdx), %r15d -; AVX1-NEXT: vmovq %xmm6, %rdx -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero -; AVX1-NEXT: leal -1(%r14,%rdx), %r14d -; AVX1-NEXT: vpextrq $1, %xmm1, %rdx -; AVX1-NEXT: leal -1(%r11,%rdx), %edx -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX1-NEXT: leal -1(%rsi,%rcx), %ecx -; AVX1-NEXT: vpextrq $1, %xmm1, %rsi -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX1-NEXT: leal -1(%rbp,%rsi), %esi -; AVX1-NEXT: vmovq %xmm1, %rbx -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX1-NEXT: leal -1(%rbp,%rbx), %ebx -; AVX1-NEXT: vpextrq $1, %xmm8, %r11 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vpextrq $1, %xmm0, %r12 -; AVX1-NEXT: leal -1(%r11,%r12), %r11d -; AVX1-NEXT: vmovq %xmm8, %r12 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-NEXT: vmovq %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: vpextrq $1, %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vmovq %xmm6, %r10 +; AVX1-NEXT: vpextrq $1, %xmm6, %r9 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX1-NEXT: vmovq %xmm7, %r8 +; AVX1-NEXT: vpextrq $1, %xmm7, %rdi +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-NEXT: vpextrq $1, %xmm6, %rcx +; AVX1-NEXT: vmovq %xmm6, %r14 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vmovq %xmm0, %rbp +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vpextrq $1, %xmm0, %r11 +; AVX1-NEXT: vmovq %xmm0, %r15 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero +; AVX1-NEXT: vpextrq $1, %xmm2, %rbx +; AVX1-NEXT: vmovq %xmm2, %rdx +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; 
AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rsi +; AVX1-NEXT: addq %rcx, %rsi ; AVX1-NEXT: vmovq %xmm0, %r13 -; AVX1-NEXT: leal -1(%r12,%r13), %ebp -; AVX1-NEXT: shrl %ebp -; AVX1-NEXT: vmovd %ebp, %xmm0 -; AVX1-NEXT: shrl %r11d -; AVX1-NEXT: vpinsrb $1, %r11d, %xmm0, %xmm0 -; AVX1-NEXT: shrl %ebx -; AVX1-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 -; AVX1-NEXT: shrl %esi -; AVX1-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 -; AVX1-NEXT: shrl %r14d -; AVX1-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0 -; AVX1-NEXT: shrl %r15d -; AVX1-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0 -; AVX1-NEXT: shrl %r8d -; AVX1-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0 -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX1-NEXT: shrl %edi -; AVX1-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; AVX1-NEXT: shrl %r9d -; AVX1-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: shrl %r10d -; AVX1-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0 -; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: addq %r14, %r13 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpextrq $1, %xmm1, %r12 +; AVX1-NEXT: addq %rax, %r12 +; AVX1-NEXT: vmovq %xmm1, %r14 +; AVX1-NEXT: addq %rbp, %r14 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm7[0],zero,xmm7[1],zero +; AVX1-NEXT: vpextrq $1, %xmm1, %rbp +; AVX1-NEXT: addq %r11, %rbp +; AVX1-NEXT: vmovq %xmm1, %r11 +; AVX1-NEXT: addq %r15, %r11 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpextrq $1, %xmm0, %r15 +; AVX1-NEXT: addq %rbx, %r15 +; AVX1-NEXT: vmovq %xmm0, %rbx +; AVX1-NEXT: addq %rdx, %rbx +; AVX1-NEXT: vpextrq $1, %xmm6, %rax +; AVX1-NEXT: leaq -1(%rdi,%rax), %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vmovq %xmm6, %rax +; AVX1-NEXT: leaq -1(%r8,%rax), %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrq $1, %xmm5, %rax +; AVX1-NEXT: leaq -1(%r9,%rax), %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vmovq %xmm5, %rax +; AVX1-NEXT: leaq -1(%r10,%rax), %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrq $1, %xmm4, %rax +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX1-NEXT: leaq -1(%rcx,%rax), %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vmovq %xmm4, %rax +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX1-NEXT: leaq -1(%rcx,%rax), %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: 
vpextrq $1, %xmm8, %rax +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: leaq -1(%rax,%rcx), %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vmovq %xmm8, %rax +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: leaq -1(%rax,%rcx), %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: xorl %r10d, %r10d +; AVX1-NEXT: addq $-1, %rsi +; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: adcq $-1, %rcx +; AVX1-NEXT: addq $-1, %r13 +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: adcq $-1, %rax +; AVX1-NEXT: addq $-1, %r12 +; AVX1-NEXT: movl $0, %edi +; AVX1-NEXT: adcq $-1, %rdi +; AVX1-NEXT: addq $-1, %r14 +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: adcq $-1, %rsi +; AVX1-NEXT: addq $-1, %rbp +; AVX1-NEXT: movl $0, %r9d +; AVX1-NEXT: adcq $-1, %r9 +; AVX1-NEXT: addq $-1, %r11 +; AVX1-NEXT: movl $0, %r8d +; AVX1-NEXT: adcq $-1, %r8 +; AVX1-NEXT: addq $-1, %r15 +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: adcq $-1, %rdx +; AVX1-NEXT: addq $-1, %rbx +; AVX1-NEXT: adcq $-1, %r10 +; AVX1-NEXT: shldq $63, %r11, %r8 +; AVX1-NEXT: shldq $63, %rbp, %r9 +; AVX1-NEXT: shldq $63, %r14, %rsi +; AVX1-NEXT: shldq $63, %r12, %rdi +; AVX1-NEXT: shldq $63, %r13, %rax +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbp, %rcx +; AVX1-NEXT: shldq $63, %rbx, %r10 +; AVX1-NEXT: shldq $63, %r15, %rdx +; AVX1-NEXT: vmovq %rcx, %xmm8 +; AVX1-NEXT: vmovq %rax, %xmm9 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: vmovq %rax, %xmm11 +; AVX1-NEXT: vmovq %rdi, %xmm12 +; AVX1-NEXT: vmovq %rsi, %xmm13 +; AVX1-NEXT: vmovq %rdx, %xmm14 +; AVX1-NEXT: vmovq %r10, %xmm15 +; AVX1-NEXT: vmovq %r9, %xmm10 +; AVX1-NEXT: vmovq %r8, %xmm1 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: vmovq %rax, %xmm4 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: vmovq %rax, %xmm5 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: vmovq %rax, %xmm6 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: vmovq %rax, %xmm7 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; AVX1-NEXT: vpsllq $48, %xmm8, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %r12 @@ -2154,123 +2279,230 @@ ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $16, %rsp ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpextrq $1, %xmm4, %rbx +; AVX2-NEXT: vmovq %xmm4, %rbp +; AVX2-NEXT: vpextrq $1, %xmm3, %rdi +; AVX2-NEXT: vmovq %xmm3, %rcx +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpextrq $1, %xmm3, %rdx +; AVX2-NEXT: vmovq %xmm3, %r9 +; AVX2-NEXT: vpextrq $1, %xmm2, %r13 +; AVX2-NEXT: vmovq %xmm2, %r12 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; 
AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm7 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpextrq $1, %xmm2, %r15 -; AVX2-NEXT: vmovq %xmm2, %r14 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpextrq $1, %xmm3, %r14 +; AVX2-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %r13 -; AVX2-NEXT: vmovq %xmm1, %r11 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm1, %r10 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpextrq $1, %xmm4, %rax +; AVX2-NEXT: addq %rbx, %rax +; AVX2-NEXT: movq %rax, %rbx +; AVX2-NEXT: vmovq %xmm4, %rsi +; AVX2-NEXT: addq %rbp, %rsi +; AVX2-NEXT: vpextrq $1, %xmm3, %rax +; AVX2-NEXT: addq %rdi, %rax +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: vmovq %xmm3, %r11 +; AVX2-NEXT: addq %rcx, %r11 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpextrq $1, %xmm3, %rcx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: vmovq %xmm3, %r8 +; AVX2-NEXT: addq %r9, %r8 +; AVX2-NEXT: vpextrq $1, %xmm2, %r9 +; AVX2-NEXT: addq %r13, %r9 +; AVX2-NEXT: vmovq %xmm2, %r15 +; AVX2-NEXT: addq %r12, %r15 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vmovd %xmm9, %r12d -; AVX2-NEXT: vpextrd $2, %xmm9, %r9d -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vmovd %xmm7, %ecx -; AVX2-NEXT: vpextrd $2, %xmm7, %edi -; AVX2-NEXT: vmovd %xmm5, %ebx -; AVX2-NEXT: vpextrd $2, 
%xmm5, %esi -; AVX2-NEXT: vmovd %xmm4, %edx -; AVX2-NEXT: vpextrd $2, %xmm4, %ebp -; AVX2-NEXT: vpextrd $2, %xmm1, %eax -; AVX2-NEXT: leal -1(%rbp,%rax), %eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: leal -1(%rdx,%rax), %eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrd $2, %xmm8, %eax -; AVX2-NEXT: leal -1(%rsi,%rax), %eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vmovd %xmm8, %eax -; AVX2-NEXT: leal -1(%rbx,%rax), %r10d -; AVX2-NEXT: vpextrd $2, %xmm6, %eax -; AVX2-NEXT: leal -1(%rdi,%rax), %r8d -; AVX2-NEXT: vmovd %xmm6, %eax -; AVX2-NEXT: leal -1(%rcx,%rax), %edi -; AVX2-NEXT: vpextrd $2, %xmm3, %eax -; AVX2-NEXT: leal -1(%r9,%rax), %r9d -; AVX2-NEXT: vmovd %xmm3, %ecx -; AVX2-NEXT: leal -1(%r12,%rcx), %r12d -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: leal -1(%r15,%rcx), %r15d -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: leal -1(%r14,%rcx), %r14d -; AVX2-NEXT: vpextrq $1, %xmm2, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leal -1(%rax,%rdx), %edx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpextrq $1, %xmm3, %rax +; AVX2-NEXT: addq %r14, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vmovq %xmm3, %rax +; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpextrq $1, %xmm0, %rbp +; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; AVX2-NEXT: vmovq %xmm0, %r12 +; AVX2-NEXT: addq %r10, %r12 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %r10 +; AVX2-NEXT: addq %rax, %r10 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: addq $-1, %rbx +; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: adcq $-1, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %rsi +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: adcq $-1, %rax +; AVX2-NEXT: movq %rax, (%rsp) # 8-byte Spill +; AVX2-NEXT: addq $-1, %rdi +; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: adcq $-1, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %r11 +; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: adcq $-1, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: adcq $-1, %rax +; 
AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: adcq $-1, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %r9 +; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: adcq $-1, %rax +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: addq $-1, %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %r15d +; AVX2-NEXT: adcq $-1, %r15 +; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movl $0, %r13d +; AVX2-NEXT: adcq $-1, %r13 +; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movl $0, %r14d +; AVX2-NEXT: adcq $-1, %r14 +; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movl $0, %ebx +; AVX2-NEXT: adcq $-1, %rbx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: addq $-1, %rax +; AVX2-NEXT: movl $0, %r11d +; AVX2-NEXT: adcq $-1, %r11 +; AVX2-NEXT: addq $-1, %rbp +; AVX2-NEXT: movl $0, %r9d +; AVX2-NEXT: adcq $-1, %r9 +; AVX2-NEXT: addq $-1, %r12 +; AVX2-NEXT: movl $0, %r8d +; AVX2-NEXT: adcq $-1, %r8 +; AVX2-NEXT: addq $-1, %r10 +; AVX2-NEXT: movl $0, %edi +; AVX2-NEXT: adcq $-1, %rdi +; AVX2-NEXT: addq $-1, %rdx +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: adcq $-1, %rcx +; AVX2-NEXT: shldq $63, %rdx, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq $63, %r10, %rdi +; AVX2-NEXT: shldq $63, %r12, %r8 +; AVX2-NEXT: shldq $63, %rbp, %r9 +; AVX2-NEXT: shldq $63, %rax, %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX2-NEXT: shldq $63, %rdx, %rbx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX2-NEXT: shldq $63, %rdx, %r14 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX2-NEXT: shldq $63, %rdx, %r13 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: shldq $63, %rax, %r15 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: shldq $63, %rax, %rsi +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: shldq $63, %rax, %rsi +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: shldq $63, %rax, %r12 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: leal -1(%rcx,%rax), %eax -; AVX2-NEXT: vpextrq $1, %xmm0, %rsi -; AVX2-NEXT: leal -1(%r13,%rsi), %esi -; AVX2-NEXT: vmovq %xmm0, %rbx -; AVX2-NEXT: leal -1(%r11,%rbx), %ebx -; AVX2-NEXT: vpextrq $1, %xmm10, %rcx -; AVX2-NEXT: vpextrq $1, %xmm11, %r13 -; AVX2-NEXT: leal -1(%rcx,%r13), %ecx -; AVX2-NEXT: vmovq %xmm10, %r13 -; AVX2-NEXT: vmovq %xmm11, %r11 -; AVX2-NEXT: leaq -1(%r13,%r11), %rbp -; AVX2-NEXT: shrq %rbp -; AVX2-NEXT: vmovd %ebp, %xmm0 -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: shrl %ebx -; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 -; AVX2-NEXT: shrl %esi -; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; AVX2-NEXT: shrl %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX2-NEXT: shrl %edx -; AVX2-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 -; AVX2-NEXT: shrl %r14d -; 
AVX2-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0 -; AVX2-NEXT: shrl %r15d -; AVX2-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0 -; AVX2-NEXT: shrl %r12d -; AVX2-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0 -; AVX2-NEXT: shrl %r9d -; AVX2-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: shrl %edi -; AVX2-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; AVX2-NEXT: shrl %r8d -; AVX2-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: shrl %r10d -; AVX2-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX2-NEXT: shrl %eax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX2-NEXT: shrl %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX2-NEXT: shrl %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: shldq $63, %rax, %rcx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: shldq $63, %rax, %r10 +; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX2-NEXT: shldq $63, %rdx, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX2-NEXT: shldq $63, %rdx, %rbp +; AVX2-NEXT: vmovq %rbp, %xmm8 +; AVX2-NEXT: vmovq %rax, %xmm9 +; AVX2-NEXT: vmovq %r10, %xmm0 +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vmovq %r12, %xmm12 +; AVX2-NEXT: vmovq %rsi, %xmm13 +; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 8-byte Folded Reload +; AVX2-NEXT: # xmm14 = mem[0],zero +; AVX2-NEXT: vmovq %r15, %xmm15 +; AVX2-NEXT: vmovq %r13, %xmm10 +; AVX2-NEXT: vmovq %r14, %xmm11 +; AVX2-NEXT: vmovq %rbx, %xmm2 +; AVX2-NEXT: vmovq %r11, %xmm3 +; AVX2-NEXT: vmovq %r9, %xmm4 +; AVX2-NEXT: vmovq %r8, %xmm5 +; AVX2-NEXT: vmovq %rdi, %xmm6 +; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload +; AVX2-NEXT: # xmm7 = mem[0],zero +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpbroadcastw %xmm8, %xmm8 +; AVX2-NEXT: vpbroadcastw %xmm9, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm9, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: addq $16, %rsp ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 ; AVX2-NEXT: popq %r13 @@ -2280,139 +2512,414 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: not_avg_v16i8_wide_constants: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm7 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpextrq $1, %xmm2, %r15 -; AVX512-NEXT: vmovq %xmm2, %r14 -; AVX512-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX512-NEXT: vpextrq $1, %xmm1, %r13 -; AVX512-NEXT: vmovq %xmm1, %r11 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX512-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm1 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vmovd %xmm9, %r12d -; AVX512-NEXT: vpextrd $2, %xmm9, %r9d -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512-NEXT: vmovd %xmm7, %ecx -; AVX512-NEXT: vpextrd $2, %xmm7, %edi -; AVX512-NEXT: vmovd %xmm5, %ebx -; AVX512-NEXT: vpextrd $2, %xmm5, %esi -; AVX512-NEXT: vmovd %xmm4, %edx -; AVX512-NEXT: vpextrd $2, %xmm4, %ebp -; AVX512-NEXT: vpextrd $2, %xmm1, %eax -; AVX512-NEXT: leal -1(%rbp,%rax), %eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: leal -1(%rdx,%rax), %eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrd $2, %xmm8, %eax -; AVX512-NEXT: leal -1(%rsi,%rax), %eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vmovd %xmm8, %eax -; AVX512-NEXT: leal -1(%rbx,%rax), %r10d -; AVX512-NEXT: vpextrd $2, %xmm6, %eax -; AVX512-NEXT: leal -1(%rdi,%rax), %r8d -; AVX512-NEXT: vmovd %xmm6, %eax -; AVX512-NEXT: leal -1(%rcx,%rax), %edi -; AVX512-NEXT: vpextrd $2, %xmm3, %eax -; AVX512-NEXT: leal -1(%r9,%rax), %r9d -; AVX512-NEXT: vmovd %xmm3, %ecx -; AVX512-NEXT: leal -1(%r12,%rcx), %r12d -; AVX512-NEXT: vpextrq $1, %xmm0, %rcx -; AVX512-NEXT: leal -1(%r15,%rcx), %r15d -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: leal -1(%r14,%rcx), %r14d -; AVX512-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: leal -1(%rax,%rdx), %edx -; AVX512-NEXT: vmovq %xmm2, %rax -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: leal -1(%rcx,%rax), %eax -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: leal -1(%r13,%rsi), %esi -; AVX512-NEXT: vmovq %xmm0, %rbx -; AVX512-NEXT: leal -1(%r11,%rbx), %ebx -; AVX512-NEXT: vpextrq $1, %xmm10, %rcx -; AVX512-NEXT: vpextrq $1, %xmm11, %r13 -; AVX512-NEXT: leal -1(%rcx,%r13), %ecx -; AVX512-NEXT: vmovq %xmm10, %r13 -; AVX512-NEXT: vmovq %xmm11, %r11 -; AVX512-NEXT: leaq -1(%r13,%r11), %rbp -; AVX512-NEXT: shrq %rbp -; AVX512-NEXT: vmovd %ebp, %xmm0 -; AVX512-NEXT: shrl %ecx -; AVX512-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; AVX512-NEXT: shrl %ebx -; AVX512-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 -; AVX512-NEXT: shrl %esi -; AVX512-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; AVX512-NEXT: shrl %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX512-NEXT: shrl %edx -; AVX512-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 -; AVX512-NEXT: shrl %r14d -; AVX512-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0 -; AVX512-NEXT: shrl %r15d -; AVX512-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0 -; AVX512-NEXT: shrl %r12d -; AVX512-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0 -; AVX512-NEXT: shrl %r9d -; AVX512-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0 -; AVX512-NEXT: shrl %edi -; AVX512-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; AVX512-NEXT: shrl %r8d -; AVX512-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0 -; AVX512-NEXT: shrl %r10d -; AVX512-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX512-NEXT: shrl %eax -; AVX512-NEXT: vpinsrb $13, %eax, 
%xmm0, %xmm0 -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX512-NEXT: shrl %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX512-NEXT: shrl %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu %xmm0, (%rax) -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: not_avg_v16i8_wide_constants: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: pushq %r15 +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %r13 +; AVX512F-NEXT: pushq %r12 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512F-NEXT: vmovq %xmm5, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm4, %rax +; AVX512F-NEXT: vmovq %xmm4, %rbx +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdi +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: vpextrq $1, %xmm1, %r13 +; AVX512F-NEXT: vmovq %xmm1, %r15 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrq $1, %xmm2, %r12 +; AVX512F-NEXT: vmovq %xmm2, %r14 +; AVX512F-NEXT: vpextrq $1, %xmm1, %r11 +; AVX512F-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %r10 +; AVX512F-NEXT: vmovq %xmm1, %r9 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpextrq $1, %xmm5, %rbp +; AVX512F-NEXT: leal -1(%rdx,%rbp), %edx +; 
AVX512F-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-NEXT: vmovq %xmm5, %rbp +; AVX512F-NEXT: leal -1(%rcx,%rbp), %ecx +; AVX512F-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-NEXT: vpextrq $1, %xmm4, %rbp +; AVX512F-NEXT: leal -1(%rax,%rbp), %eax +; AVX512F-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-NEXT: vmovq %xmm4, %rbp +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: leal -1(%rbx,%rbp), %r8d +; AVX512F-NEXT: vpextrq $1, %xmm4, %rbp +; AVX512F-NEXT: leal -1(%rdi,%rbp), %edi +; AVX512F-NEXT: vmovq %xmm4, %rbp +; AVX512F-NEXT: leal -1(%rsi,%rbp), %esi +; AVX512F-NEXT: vpextrq $1, %xmm3, %rbp +; AVX512F-NEXT: leal -1(%r13,%rbp), %r13d +; AVX512F-NEXT: vmovq %xmm3, %rbp +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: leal -1(%r15,%rbp), %r15d +; AVX512F-NEXT: vpextrq $1, %xmm3, %rbp +; AVX512F-NEXT: leal -1(%r12,%rbp), %r12d +; AVX512F-NEXT: vmovq %xmm3, %rbp +; AVX512F-NEXT: leal -1(%r14,%rbp), %r14d +; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512F-NEXT: leal -1(%r11,%rdx), %r11d +; AVX512F-NEXT: vmovq %xmm2, %rbp +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512F-NEXT: leal -1(%rax,%rbp), %ebp +; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx +; AVX512F-NEXT: leal -1(%r10,%rcx), %ecx +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: leal -1(%r9,%rax), %eax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm1, %r10 +; AVX512F-NEXT: leal -1(%rdx,%r10), %edx +; AVX512F-NEXT: vmovq %xmm0, %r10 +; AVX512F-NEXT: vmovq %xmm1, %r9 +; AVX512F-NEXT: leaq -1(%r10,%r9), %rbx +; AVX512F-NEXT: shrq %rbx +; AVX512F-NEXT: vmovd %ebx, %xmm0 +; AVX512F-NEXT: shrl %edx +; AVX512F-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %ecx +; AVX512F-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %ebp +; AVX512F-NEXT: vpinsrb $4, %ebp, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %r11d +; AVX512F-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %r14d +; AVX512F-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %r12d +; AVX512F-NEXT: vpinsrb $7, %r12d, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %r15d +; AVX512F-NEXT: vpinsrb $8, %r15d, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %r13d +; AVX512F-NEXT: vpinsrb $9, %r13d, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %esi +; AVX512F-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %edi +; AVX512F-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0 +; AVX512F-NEXT: shrl %r8d +; AVX512F-NEXT: vpinsrb $12, %r8d, %xmm0, %xmm0 +; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX512F-NEXT: shrl %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX512F-NEXT: shrl %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX512F-NEXT: shrl %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqu %xmm0, (%rax) +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r12 +; AVX512F-NEXT: popq %r13 +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: popq %r15 +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: 
vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: not_avg_v16i8_wide_constants: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: subq $24, %rsp +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-NEXT: vmovq %xmm4, %rbx +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rbp +; AVX512BW-NEXT: vmovq %xmm3, %rdi +; AVX512BW-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm3, %r15 +; AVX512BW-NEXT: vmovq %xmm2, %r8 +; AVX512BW-NEXT: vpextrq $1, %xmm2, %r14 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, %r9 +; AVX512BW-NEXT: vpextrq $1, %xmm3, %r10 +; AVX512BW-NEXT: vmovq %xmm2, %r11 +; AVX512BW-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512BW-NEXT: vpextrq $1, %xmm2, %r13 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-NEXT: vmovq %xmm4, %rax +; AVX512BW-NEXT: addq %rbx, %rax +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rax +; AVX512BW-NEXT: addq %rbp, %rax +; AVX512BW-NEXT: movq %rax, %rbp +; AVX512BW-NEXT: vmovq %xmm3, %rcx +; AVX512BW-NEXT: addq %rdi, %rcx +; AVX512BW-NEXT: vpextrq $1, %xmm3, %r12 +; AVX512BW-NEXT: addq %rsi, %r12 +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, %rax +; AVX512BW-NEXT: addq %rdx, %rax +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax +; AVX512BW-NEXT: addq %r15, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: 
vmovq %xmm2, %rax +; AVX512BW-NEXT: addq %r8, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax +; AVX512BW-NEXT: addq %r14, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, %rax +; AVX512BW-NEXT: addq %r9, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax +; AVX512BW-NEXT: addq %r10, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: vmovq %xmm2, %rax +; AVX512BW-NEXT: addq %r11, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: vpextrq $1, %xmm2, %r14 +; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, %r10 +; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; AVX512BW-NEXT: vpextrq $1, %xmm2, %r9 +; AVX512BW-NEXT: addq %r13, %r9 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vmovq %xmm1, %r8 +; AVX512BW-NEXT: addq %rax, %r8 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdi +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512BW-NEXT: addq %rdi, %rsi +; AVX512BW-NEXT: addq $-1, %rbx +; AVX512BW-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: movl $0, %r15d +; AVX512BW-NEXT: adcq $-1, %r15 +; AVX512BW-NEXT: addq $-1, %rbp +; AVX512BW-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: movl $0, %ebx +; AVX512BW-NEXT: adcq $-1, %rbx +; AVX512BW-NEXT: addq $-1, %rcx +; AVX512BW-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: movl $0, %r11d +; AVX512BW-NEXT: adcq $-1, %r11 +; AVX512BW-NEXT: addq $-1, %r12 +; AVX512BW-NEXT: movq %r12, (%rsp) # 8-byte Spill +; AVX512BW-NEXT: movl $0, %edi +; AVX512BW-NEXT: adcq $-1, %rdi +; AVX512BW-NEXT: addq $-1, %rdx +; AVX512BW-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: movl $0, %eax +; AVX512BW-NEXT: adcq $-1, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512BW-NEXT: movl $0, %eax +; AVX512BW-NEXT: adcq $-1, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512BW-NEXT: movl $0, %r13d +; AVX512BW-NEXT: adcq $-1, %r13 +; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512BW-NEXT: movl $0, %r12d +; AVX512BW-NEXT: adcq $-1, %r12 +; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512BW-NEXT: movl $0, %eax +; AVX512BW-NEXT: adcq $-1, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512BW-NEXT: movl $0, %eax +; AVX512BW-NEXT: adcq $-1, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512BW-NEXT: addq $-1, %rcx +; AVX512BW-NEXT: movl $0, %eax +; AVX512BW-NEXT: adcq $-1, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: addq $-1, %r14 +; AVX512BW-NEXT: movl $0, %eax +; AVX512BW-NEXT: adcq $-1, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: addq $-1, %r10 +; AVX512BW-NEXT: movl $0, %eax +; AVX512BW-NEXT: adcq $-1, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: addq $-1, %r9 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: adcq $-1, %rdx +; AVX512BW-NEXT: addq $-1, %r8 +; AVX512BW-NEXT: movl $0, %eax +; AVX512BW-NEXT: adcq $-1, %rax +; AVX512BW-NEXT: addq $-1, %rsi +; AVX512BW-NEXT: movl $0, %ebp +; AVX512BW-NEXT: adcq $-1, %rbp +; AVX512BW-NEXT: shldq $63, %rsi, %rbp +; AVX512BW-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: shldq $63, %r8, %rax +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: shldq $63, %r9, %rdx +; AVX512BW-NEXT: movq %rdx, %rbp +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %r10, %r8 +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %r14, %r10 +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rcx, %r9 +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rax, %r14 +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rax, %rsi +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rax, %r12 +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rax, %r13 +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rax, %rdx +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rax, %rcx +; AVX512BW-NEXT: movq (%rsp), %rax # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rax, %rdi +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rax, %r11 +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rax, %rbx +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: shldq $63, %rax, %r15 +; AVX512BW-NEXT: vmovq %r15, %xmm0 +; AVX512BW-NEXT: vmovq %rbx, %xmm1 +; AVX512BW-NEXT: vmovq %r11, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovq %rdi, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: vmovd %eax, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; 
AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovq %rcx, %xmm1 +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vmovq %r13, %xmm3 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vmovq %r12, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %rsi, %xmm1 +; AVX512BW-NEXT: vmovq %r14, %xmm2 +; AVX512BW-NEXT: vmovq %r9, %xmm3 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vmovq %r10, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %r8, %xmm1 +; AVX512BW-NEXT: vmovq %rbp, %xmm2 +; AVX512BW-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 8-byte Folded Reload +; AVX512BW-NEXT: # xmm3 = mem[0],zero +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 8-byte Folded Reload +; AVX512BW-NEXT: # xmm2 = mem[0],zero +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: addq $24, %rsp +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a %2 = load <16 x i8>, <16 x i8>* %b %3 = zext <16 x i8> %1 to <16 x i128> Index: llvm/test/CodeGen/X86/avx-cvt-2.ll =================================================================== --- llvm/test/CodeGen/X86/avx-cvt-2.ll +++ llvm/test/CodeGen/X86/avx-cvt-2.ll @@ -40,7 +40,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; 
CHECK-NEXT: vmovq %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper Index: llvm/test/CodeGen/X86/avx-fp2int.ll =================================================================== --- llvm/test/CodeGen/X86/avx-fp2int.ll +++ llvm/test/CodeGen/X86/avx-fp2int.ll @@ -7,6 +7,7 @@ ; CHECK-LABEL: test1: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %c = fptoui <4 x double> %d to <4 x i8> @@ -16,6 +17,7 @@ ; CHECK-LABEL: test2: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %c = fptosi <4 x double> %d to <4 x i8> Index: llvm/test/CodeGen/X86/avx2-conversions.ll =================================================================== --- llvm/test/CodeGen/X86/avx2-conversions.ll +++ llvm/test/CodeGen/X86/avx2-conversions.ll @@ -117,14 +117,12 @@ define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind { ; X32-LABEL: zext_8i8_8i32: ; X32: # %bb.0: -; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X32-NEXT: retl ; ; X64-LABEL: zext_8i8_8i32: ; X64: # %bb.0: -; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X64-NEXT: retq %B = zext <8 x i8> %A to <8 x i32> ret <8 x i32>%B Index: llvm/test/CodeGen/X86/avx2-masked-gather.ll =================================================================== --- llvm/test/CodeGen/X86/avx2-masked-gather.ll +++ llvm/test/CodeGen/X86/avx2-masked-gather.ll @@ -9,23 +9,21 @@ define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { ; X86-LABEL: masked_gather_v2i32: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1 -; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2i32: ; X64: # %bb.0: # %entry ; X64-NEXT: vmovdqa (%rdi), %xmm2 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 -; X64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; NOGATHER-LABEL: masked_gather_v2i32: @@ -36,16 +34,14 @@ ; NOGATHER-NEXT: je .LBB0_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load ; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $0, %rax, 
%xmm1, %xmm1 +; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm1, %xmm1 ; NOGATHER-NEXT: .LBB0_2: # %else ; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB0_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm1, %xmm1 +; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm1, %xmm1 ; NOGATHER-NEXT: .LBB0_4: # %else2 ; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0 ; NOGATHER-NEXT: retq @@ -58,11 +54,10 @@ define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { ; X86-LABEL: masked_gather_v2i32_concat: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl @@ -70,7 +65,6 @@ ; X64-LABEL: masked_gather_v2i32_concat: ; X64: # %bb.0: # %entry ; X64-NEXT: vmovdqa (%rdi), %xmm2 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 @@ -85,18 +79,16 @@ ; NOGATHER-NEXT: je .LBB1_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load ; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1 +; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm1, %xmm1 ; NOGATHER-NEXT: .LBB1_2: # %else ; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB1_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax -; NOGATHER-NEXT: movl (%rax), %eax -; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm1, %xmm1 +; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm1, %xmm1 ; NOGATHER-NEXT: .LBB1_4: # %else2 -; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0 ; NOGATHER-NEXT: retq entry: %ld = load <2 x i32*>, <2 x i32*>* %ptr @@ -676,10 +668,10 @@ define <2 x i64> @masked_gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i64> %passthro) { ; X86-LABEL: masked_gather_v2i64: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovsxdq (%eax), %xmm2 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vpgatherdq %xmm0, (,%xmm2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl ; @@ -721,10 +713,10 @@ define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks, <2 x double> %passthro) { ; X86-LABEL: masked_gather_v2double: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovsxdq (%eax), %xmm2 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vgatherdpd %xmm0, (,%xmm2), %xmm1 ; X86-NEXT: vmovapd %xmm1, %xmm0 ; X86-NEXT: retl ; Index: llvm/test/CodeGen/X86/avx2-vbroadcast.ll =================================================================== --- llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -657,12 +657,12 @@ define <8 x i8> @_e4(i8* %ptr) 
nounwind uwtable readnone ssp { ; X32-LABEL: _e4: ; X32: ## %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52] +; X32-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> ; X32-NEXT: retl ; ; X64-LABEL: _e4: ; X64: ## %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52] +; X64-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> ; X64-NEXT: retq %vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0 %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1 Index: llvm/test/CodeGen/X86/avx512-any_extend_load.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-any_extend_load.ll +++ llvm/test/CodeGen/X86/avx512-any_extend_load.ll @@ -4,13 +4,25 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) { -; ALL-LABEL: any_extend_load_v8i64: -; ALL: # %bb.0: -; ALL-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; ALL-NEXT: vpmovqb %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; KNL-LABEL: any_extend_load_v8i64: +; KNL: # %bb.0: +; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; KNL-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; KNL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; KNL-NEXT: vpmovqb %zmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: any_extend_load_v8i64: +; SKX: # %bb.0: +; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero +; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; SKX-NEXT: vpmovqb %zmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1 %1 = zext <8 x i8> %wide.load to <8 x i64> %2 = add nuw nsw <8 x i64> %1, @@ -23,10 +35,12 @@ define void @any_extend_load_v8i32(<8 x i8> * %ptr) { ; KNL-LABEL: any_extend_load_v8i32: ; KNL: # %bb.0: -; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; 
KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: any_extend_load_v8i32: Index: llvm/test/CodeGen/X86/avx512-cvt.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-cvt.ll +++ llvm/test/CodeGen/X86/avx512-cvt.ll @@ -513,15 +513,14 @@ ; NOVL-LABEL: f64to8uc: ; NOVL: # %bb.0: ; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; NOVL-NEXT: vpmovdw %zmm0, %ymm0 -; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; NOVL-NEXT: vpmovdb %zmm0, %xmm0 ; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VL-LABEL: f64to8uc: ; VL: # %bb.0: ; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VL-NEXT: vpmovdw %ymm0, %xmm0 +; VL-NEXT: vpmovdb %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq %res = fptoui <8 x double> %f to <8 x i8> @@ -657,15 +656,14 @@ ; NOVL-LABEL: f64to8sc: ; NOVL: # %bb.0: ; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; NOVL-NEXT: vpmovdw %zmm0, %ymm0 -; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; NOVL-NEXT: vpmovdb %zmm0, %xmm0 ; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VL-LABEL: f64to8sc: ; VL: # %bb.0: ; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VL-NEXT: vpmovdw %ymm0, %xmm0 +; VL-NEXT: vpmovdb %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq %res = fptosi <8 x double> %f to <8 x i8> @@ -1557,9 +1555,7 @@ define <8 x double> @scto8f64(<8 x i8> %a) { ; ALL-LABEL: scto8f64: ; ALL: # %bb.0: -; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; ALL-NEXT: vpslld $24, %ymm0, %ymm0 -; ALL-NEXT: vpsrad $24, %ymm0, %ymm0 +; ALL-NEXT: vpmovsxbd %xmm0, %ymm0 ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq %1 = sitofp <8 x i8> %a to <8 x double> @@ -1724,13 +1720,30 @@ } define <2 x double> @sbto2f64(<2 x double> %a) { -; ALL-LABEL: sbto2f64: -; ALL: # %bb.0: -; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; ALL-NEXT: retq +; NOVL-LABEL: sbto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VLDQ-LABEL: sbto2f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sbto2f64: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VLNODQ-NEXT: retq %cmpres = fcmp ogt <2 x double> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x double> ret <2 x double> %1 @@ -1749,8 +1762,7 @@ define <8 x double> @ucto8f64(<8 x i8> %a) { ; ALL-LABEL: ucto8f64: ; ALL: # %bb.0: -; ALL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; ALL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq %b = uitofp <8 x i8> %a to <8 x double> @@ -1993,29 
+2005,42 @@ } define <2 x float> @ubto2f32(<2 x i32> %a) { -; ALL-LABEL: ubto2f32: -; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: retq +; NOVL-LABEL: ubto2f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 } define <2 x double> @ubto2f64(<2 x i32> %a) { -; ALL-LABEL: ubto2f64: -; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; ALL-NEXT: retq +; NOVL-LABEL: ubto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f64: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 Index: llvm/test/CodeGen/X86/avx512-ext.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-ext.ll +++ llvm/test/CodeGen/X86/avx512-ext.ll @@ -2134,28 +2134,53 @@ } define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 { -; ALL-LABEL: zext_4xi1_to_4x32: -; ALL: # %bb.0: -; ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 -; ALL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; ALL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpsrld $31, %xmm0, %xmm0 -; ALL-NEXT: retq +; KNL-LABEL: zext_4xi1_to_4x32: +; KNL: # %bb.0: +; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_4xi1_to_4x32: +; SKX: # %bb.0: +; SKX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vpsrld $31, %xmm0, %xmm0 +; SKX-NEXT: retq +; +; AVX512DQNOBW-LABEL: zext_4xi1_to_4x32: +; AVX512DQNOBW: # %bb.0: +; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512DQNOBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512DQNOBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQNOBW-NEXT: retq %mask = icmp eq <4 x i8> %x, %y %1 = zext <4 x i1> %mask to <4 x i32> ret <4 x i32> %1 } define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 { -; ALL-LABEL: 
zext_2xi1_to_2xi64: -; ALL: # %bb.0: -; ALL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 -; ALL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpsrlq $63, %xmm0, %xmm0 -; ALL-NEXT: retq +; KNL-LABEL: zext_2xi1_to_2xi64: +; KNL: # %bb.0: +; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_2xi1_to_2xi64: +; SKX: # %bb.0: +; SKX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; SKX-NEXT: vpmovm2q %k0, %xmm0 +; SKX-NEXT: vpsrlq $63, %xmm0, %xmm0 +; SKX-NEXT: retq +; +; AVX512DQNOBW-LABEL: zext_2xi1_to_2xi64: +; AVX512DQNOBW: # %bb.0: +; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512DQNOBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQNOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQNOBW-NEXT: retq %mask = icmp eq <2 x i8> %x, %y %1 = zext <2 x i1> %mask to <2 x i64> ret <2 x i64> %1 Index: llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -5478,19 +5478,19 @@ ; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xe9] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1) @@ -5515,7 +5515,7 @@ define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> 
%a1, i8 %mask) { ; X86-LABEL: test_mask_cmp_q_512: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1] ; X86-NEXT: vpcmpgtq %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x37,0xd0] @@ -5525,18 +5525,18 @@ ; X86-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc9] ; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0] -; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x00] +; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] ; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] +; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] ; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] +; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] ; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] +; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] ; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] +; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] ; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] -; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -5551,18 +5551,18 @@ ; X64-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc9] ; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0] -; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw 
$5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) @@ -5597,19 +5597,19 @@ ; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xe9,0x06] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1) @@ -5634,7 +5634,7 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; X86-LABEL: test_mask_ucmp_q_512: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1] ; X86-NEXT: vpcmpltuq %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xd1,0x01] @@ -5644,18 +5644,18 @@ ; X86-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc9,0x06] ; X86-NEXT: kmovw %k0, %ecx ## 
encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0] -; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x00] +; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] ; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] +; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] ; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] +; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] ; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] +; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] ; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] +; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] ; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] -; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -5670,18 +5670,18 @@ ; X64-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc9,0x06] ; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0] -; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: 
[0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) Index: llvm/test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-mask-op.ll +++ llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -2296,21 +2296,22 @@ ; KNL-LABEL: load_2i1: ; KNL: ## %bb.0: ; KNL-NEXT: kmovw (%rdi), %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: load_2i1: ; SKX: ## %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 -; SKX-NEXT: vpmovm2q %k0, %xmm0 +; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: retq ; ; AVX512BW-LABEL: load_2i1: ; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2318,8 +2319,9 @@ ; AVX512DQ-LABEL: load_2i1: ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 -; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2327,7 +2329,7 @@ ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovb (%eax), %k0 -; X86-NEXT: vpmovm2q %k0, %xmm0 +; X86-NEXT: vpmovm2w %k0, %xmm0 ; X86-NEXT: retl %b = load <2 x i1>, <2 x i1>* %a %c = sext <2 x i1> %b to <2 x i16> @@ -2339,20 +2341,21 @@ ; KNL: ## %bb.0: ; KNL-NEXT: kmovw (%rdi), %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: load_4i1: ; SKX: ## %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 -; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: retq ; ; AVX512BW-LABEL: load_4i1: ; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2361,7 +2364,8 @@ ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2369,7 +2373,7 @@ ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovb (%eax), %k0 -; X86-NEXT: vpmovm2d %k0, %xmm0 +; X86-NEXT: vpmovm2w %k0, %xmm0 ; X86-NEXT: retl %b = load <4 x i1>, <4 x i1>* %a %c = sext <4 x i1> %b to <4 x i16> Index: llvm/test/CodeGen/X86/avx512-trunc.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-trunc.ll +++ llvm/test/CodeGen/X86/avx512-trunc.ll @@ -36,7 +36,7 @@ define <8 x i8> 
@trunc_qb_512(<8 x i64> %i) #0 { ; ALL-LABEL: trunc_qb_512: ; ALL: ## %bb.0: -; ALL-NEXT: vpmovqw %zmm0, %xmm0 +; ALL-NEXT: vpmovqb %zmm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <8 x i64> %i to <8 x i8> @@ -58,14 +58,13 @@ ; KNL-LABEL: trunc_qb_256: ; KNL: ## %bb.0: ; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; KNL-NEXT: vpmovqb %zmm0, %xmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qb_256: ; SKX: ## %bb.0: -; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: vpmovqb %ymm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <4 x i64> %i to <4 x i8> @@ -76,8 +75,7 @@ ; KNL-LABEL: trunc_qb_256_mem: ; KNL: ## %bb.0: ; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; KNL-NEXT: vpmovqb %zmm0, %xmm0 ; KNL-NEXT: vmovd %xmm0, (%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -95,6 +93,7 @@ define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 { ; ALL-LABEL: trunc_qb_128: ; ALL: ## %bb.0: +; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; ALL-NEXT: retq %x = trunc <2 x i64> %i to <2 x i8> ret <2 x i8> %x @@ -141,14 +140,13 @@ ; KNL-LABEL: trunc_qw_256: ; KNL: ## %bb.0: ; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qw_256: ; SKX: ## %bb.0: -; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: vpmovqw %ymm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <4 x i64> %i to <4 x i16> @@ -159,8 +157,7 @@ ; KNL-LABEL: trunc_qw_256_mem: ; KNL: ## %bb.0: ; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: vmovq %xmm0, (%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -176,9 +173,16 @@ } define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 { -; ALL-LABEL: trunc_qw_128: -; ALL: ## %bb.0: -; ALL-NEXT: retq +; KNL-LABEL: trunc_qw_128: +; KNL: ## %bb.0: +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qw_128: +; SKX: ## %bb.0: +; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; SKX-NEXT: retq %x = trunc <2 x i64> %i to <2 x i16> ret <2 x i16> %x } @@ -260,6 +264,7 @@ define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { ; ALL-LABEL: trunc_qd_128: ; ALL: ## %bb.0: +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; ALL-NEXT: retq %x = trunc <2 x i64> %i to <2 x i32> ret <2 x i32> %x @@ -306,14 +311,13 @@ ; KNL-LABEL: trunc_db_256: ; KNL: ## %bb.0: ; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_db_256: ; SKX: ## %bb.0: -; SKX-NEXT: vpmovdw %ymm0, %xmm0 +; SKX-NEXT: vpmovdb %ymm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <8 x i32> %i to <8 x i8> @@ -324,8 +328,7 @@ ; KNL-LABEL: trunc_db_256_mem: ; KNL: ## %bb.0: ; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vmovq %xmm0, (%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -343,6 +346,7 @@ define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 { ; ALL-LABEL: trunc_db_128: ; ALL: ## %bb.0: +; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; ALL-NEXT: retq %x = trunc <4 x i32> %i to <4 x i8> ret <4 x i8> %x @@ -513,6 +517,7 @@ define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 { ; ALL-LABEL: trunc_wb_128: ; ALL: ## %bb.0: +; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; ALL-NEXT: retq %x = trunc <8 x i16> %i to <8 x i8> ret <8 x i8> %x @@ -691,6 +696,7 @@ ; ALL-LABEL: usat_trunc_wb_128: ; ALL: ## %bb.0: ; ALL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; ALL-NEXT: retq %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> @@ -716,16 +722,14 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] ; KNL-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: usat_trunc_db_256: ; SKX: ## %bb.0: ; SKX-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; SKX-NEXT: vpmovdw %ymm0, %xmm0 -; SKX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vpmovdb %ymm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %tmp1 = icmp ult <8 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> Index: llvm/test/CodeGen/X86/avx512-vec-cmp.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -886,22 +886,14 @@ define <4 x i32> @test44(<4 x i16> %x, <4 x i16> %y) #0 { ; AVX512-LABEL: test44: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] -; AVX512-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0e,0xca,0xaa] -; AVX512-NEXT: ## xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x76,0xc1] +; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] +; AVX512-NEXT: vpmovsxwd %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x23,0xc0] ; AVX512-NEXT: retq ## encoding: [0xc3] ; ; SKX-LABEL: test44: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; SKX-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0e,0xca,0xaa] -; SKX-NEXT: ## xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa] -; SKX-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x76,0xc1] +; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1] +; SKX-NEXT: vpmovm2d %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0] ; SKX-NEXT: retq ## encoding: [0xc3] %mask = icmp eq <4 x i16> %x, %y %1 = sext <4 x i1> %mask to <4 x i32> @@ -911,23 +903,17 @@ define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 { ; AVX512-LABEL: test45: ; AVX512: ## %bb.0: -; AVX512-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] -; AVX512-NEXT: vpblendw $17, %xmm1, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0e,0xc9,0x11] -; AVX512-NEXT: ## xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512-NEXT: vpblendw $17, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0e,0xc0,0x11] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x29,0xc1] -; AVX512-NEXT: vpsrlq $63, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x73,0xd0,0x3f] +; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] +; AVX512-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] +; AVX512-NEXT: ## fixup A - offset: 4, value: LCPI46_0-4, kind: reloc_riprel_4byte ; AVX512-NEXT: retq ## encoding: [0xc3] ; ; SKX-LABEL: test45: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; SKX-NEXT: vpblendw $17, %xmm1, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0e,0xc9,0x11] -; SKX-NEXT: ## xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; SKX-NEXT: vpblendw $17, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0e,0xc0,0x11] -; SKX-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x29,0xc1] +; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1] +; SKX-NEXT: vpmovm2q %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x38,0xc0] ; SKX-NEXT: vpsrlq $63, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xd0,0x3f] ; SKX-NEXT: retq ## encoding: [0xc3] %mask = icmp eq <2 x i16> %x, %y @@ -939,9 +925,9 @@ ; AVX512-LABEL: test46: ; AVX512: ## %bb.0: ; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x00] -; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4] -; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3] -; AVX512-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x54,0x05,A,A,A,A] +; AVX512-NEXT: vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0] +; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] ; AVX512-NEXT: ## fixup A - offset: 4, value: LCPI47_0-4, kind: reloc_riprel_4byte ; AVX512-NEXT: retq ## encoding: [0xc3] ; Index: llvm/test/CodeGen/X86/avx512-vec3-crash.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-vec3-crash.ll +++ llvm/test/CodeGen/X86/avx512-vec3-crash.ll @@ -6,19 +6,15 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovd %edi, %xmm0 -; CHECK-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 -; CHECK-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %ecx, %xmm1 -; CHECK-NEXT: vpinsrd $1, %r8d, %xmm1, %xmm1 -; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 -; CHECK-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1 -; CHECK-NEXT: vpslld $24, %xmm1, %xmm1 -; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1 +; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vpextrb $0, 
%xmm0, %eax -; CHECK-NEXT: vpextrb $4, %xmm0, %edx -; CHECK-NEXT: vpextrb $8, %xmm0, %ecx +; CHECK-NEXT: vpextrb $1, %xmm0, %edx +; CHECK-NEXT: vpextrb $2, %xmm0, %ecx ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: # kill: def $dl killed $dl killed $edx ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx Index: llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -5133,19 +5133,19 @@ ; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x65,0xe9] ; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -5169,7 +5169,7 @@ define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; X86-LABEL: test_mask_cmp_w_128: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1] ; X86-NEXT: vpcmpgtw %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x65,0xd0] @@ -5179,18 
+5179,18 @@ ; X86-NEXT: vpcmpgtw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc9] ; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x00] +; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] ; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] +; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] ; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] +; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] ; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] +; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] ; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] +; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] ; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] -; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_cmp_w_128: @@ -5204,18 +5204,18 @@ ; X64-NEXT: vpcmpgtw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc9] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovd %k5, %eax # encoding: 
[0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -5249,19 +5249,19 @@ ; CHECK-NEXT: vpcmpnleuw %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe9,0x06] ; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -5285,7 +5285,7 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; X86-LABEL: test_mask_ucmp_w_128: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: 
[0x0f,0xb7,0x44,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1] ; X86-NEXT: vpcmpltuw %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd1,0x01] @@ -5295,18 +5295,18 @@ ; X86-NEXT: vpcmpnleuw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc9,0x06] ; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x00] +; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] ; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] +; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] ; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] +; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] ; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] +; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] ; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] +; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] ; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] -; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_ucmp_w_128: @@ -5320,18 +5320,18 @@ ; X64-NEXT: vpcmpnleuw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc9,0x06] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 Index: llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll =================================================================== --- llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -3326,6 +3326,8 @@ ; CHECK-LABEL: test_mm256_cvtepi64_epi8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpmovqb %ymm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} entry: @@ -3339,6 +3341,7 @@ ; CHECK-LABEL: test_mm256_cvtepi64_epi16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpmovqw %ymm0, %xmm0 +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} entry: @@ -3352,6 +3355,7 @@ ; CHECK-LABEL: test_mm256_cvtepi32_epi8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpmovdb %ymm0, %xmm0 +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} entry: Index: llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -8069,19 +8069,19 @@ ; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k5 # encoding: [0x62,0xf1,0x7d,0x28,0x66,0xe9] ; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; 
CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1) @@ -8106,7 +8106,7 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; X86-LABEL: test_mask_cmp_d_256: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1] ; X86-NEXT: vpcmpgtd %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x66,0xd0] @@ -8116,18 +8116,18 @@ ; X86-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc9] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x00] +; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] ; X86-NEXT: kmovw %k2, %ecx # encoding: [0xc5,0xf8,0x93,0xca] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x01] +; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] ; X86-NEXT: kmovw %k3, %ecx # encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x02] +; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] ; X86-NEXT: kmovw %k4, %ecx # encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x04] +; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] ; X86-NEXT: kmovw %k5, %ecx # encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x05] +; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x06] -; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -8142,18 +8142,18 @@ ; X64-NEXT: vpcmpgtd %ymm1, 
%ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc9] ; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc7,0x07] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask) @@ -8188,19 +8188,19 @@ ; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe9,0x06] ; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: 
[0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1) @@ -8225,7 +8225,7 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; X86-LABEL: test_mask_ucmp_d_256: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1] ; X86-NEXT: vpcmpltud %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x01] @@ -8235,18 +8235,18 @@ ; X86-NEXT: vpcmpnleud %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x06] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x00] +; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] ; X86-NEXT: kmovw %k2, %ecx # encoding: [0xc5,0xf8,0x93,0xca] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x01] +; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] ; X86-NEXT: kmovw %k3, %ecx # encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x02] +; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] ; X86-NEXT: kmovw %k4, %ecx # encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x04] +; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] ; X86-NEXT: kmovw %k5, %ecx # encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x05] +; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x06] -; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -8261,18 +8261,18 @@ ; X64-NEXT: vpcmpnleud %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x06] ; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: 
[0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc7,0x07] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask) @@ -8307,19 +8307,19 @@ ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k5 # encoding: [0x62,0xf2,0xfd,0x28,0x37,0xe9] ; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1) @@ -8358,17 +8358,17 @@ ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] ; X86-NEXT: vpinsrb $0, %eax, %xmm0, 
%xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -8387,17 +8387,17 @@ ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] ; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] +; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask) @@ -8432,19 +8432,19 @@ ; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x06] ; CHECK-NEXT: kmovw %k0, %eax # encoding: 
[0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1) @@ -8482,19 +8482,19 @@ ; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X86-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X86-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X86-NEXT: kmovw %k1, %eax # 
encoding: [0xc5,0xf8,0x93,0xc1] -; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -8512,19 +8512,19 @@ ; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] +; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask) @@ -8559,19 +8559,19 @@ ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x66,0xe9] ; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; 
CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -8609,17 +8609,17 @@ ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] ; X86-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_cmp_d_128: @@ -8637,17 +8637,17 @@ ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] ; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: 
vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] +; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -8681,19 +8681,19 @@ ; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x06] ; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -8730,19 +8730,19 @@ ; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: 
[0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X86-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X86-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_ucmp_d_128: @@ -8759,19 +8759,19 @@ ; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] +; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -8805,19 +8805,19 @@ ; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k5 # encoding: [0x62,0xf2,0xfd,0x08,0x37,0xe9] ; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -8853,19 +8853,19 @@ ; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X86-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; X86-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: 
[0xc5,0xf9,0xc4,0xc0,0x07] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_cmp_q_128: @@ -8881,19 +8881,19 @@ ; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] ; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -8927,19 +8927,19 @@ ; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x06] ; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; CHECK-NEXT: vpinsrb $5, %eax, 
%xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; CHECK-NEXT: movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00] -; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -8975,19 +8975,19 @@ ; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X86-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; X86-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_ucmp_q_128: @@ -9003,19 +9003,19 @@ ; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] ; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00] +; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01] +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-NEXT: kmovw %k4, 
%eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05] +; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] ; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 Index: llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll =================================================================== --- llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll +++ llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -178,144 +178,63 @@ } define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { -; SSE2-SSSE3-LABEL: v2i8: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $56, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: psrad $31, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $24, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-SSSE3-NEXT: psllq $56, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: psrad $31, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $24, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-SSSE3-NEXT: psllq $56, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSE2-SSSE3-NEXT: psrad $31, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $24, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-SSSE3-NEXT: psllq $56, %xmm1 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: psrad $31, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $24, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: por 
%xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax -; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax -; SSE2-SSSE3-NEXT: retq -; -; AVX1-LABEL: v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $56, %xmm3, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4 -; AVX1-NEXT: vpsrad $24, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-NEXT: vpsllq $56, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4 -; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $56, %xmm3, %xmm3 -; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4 -; AVX2-NEXT: vpsrad $24, %xmm3, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX2-NEXT: vpsllq $56, %xmm2, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4 -; AVX2-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; SSE2-LABEL: v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: pcmpgtb %xmm3, %xmm2 +; 
SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movmskpd %xmm2, %eax +; SSSE3-NEXT: # kill: def $al killed $al killed $eax +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v2i8: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1 +; AVX12-NEXT: vpmovsxbq %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $56, %xmm3, %xmm3 -; AVX512F-NEXT: vpsraq $56, %xmm3, %xmm3 -; AVX512F-NEXT: vpsllq $56, %xmm2, %xmm2 -; AVX512F-NEXT: vpsraq $56, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $56, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $56, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $56, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsraq $56, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsllq $56, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsraq $56, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $56, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $56, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtb %xmm3, %xmm2, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq @@ -329,142 +248,47 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; SSE2-SSSE3-LABEL: v2i16: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $48, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: psrad $31, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $16, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-SSSE3-NEXT: psllq $48, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: psrad $31, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $16, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-SSSE3-NEXT: psllq $48, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSE2-SSSE3-NEXT: psrad $31, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $16, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-SSSE3-NEXT: psllq $48, %xmm1 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: psrad $31, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $16, %xmm1 
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax +; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $48, %xmm3, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4 -; AVX1-NEXT: vpsrad $16, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-NEXT: vpsllq $48, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4 -; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $48, %xmm3, %xmm3 -; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4 -; AVX2-NEXT: vpsrad $16, %xmm3, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX2-NEXT: vpsllq $48, %xmm2, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4 -; AVX2-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] -; 
AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX12-LABEL: v2i16: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1 +; AVX12-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $48, %xmm3, %xmm3 -; AVX512F-NEXT: vpsraq $48, %xmm3, %xmm3 -; AVX512F-NEXT: vpsllq $48, %xmm2, %xmm2 -; AVX512F-NEXT: vpsraq $48, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $48, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $48, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsraq $48, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsllq $48, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsraq $48, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $48, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtw %xmm3, %xmm2, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq @@ -478,118 +302,40 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; SSE2-SSSE3-LABEL: v2i32: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $32, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-SSSE3-NEXT: psllq $32, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: por %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: movmskpd %xmm2, %eax +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm4 -; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm4 -; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm3, %xmm4 -; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm4 -; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm3 -; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm3 -; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX12-LABEL: v2i32: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 +; AVX12-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512F-NEXT: vpsraq $32, %xmm3, %xmm3 -; AVX512F-NEXT: vpsllq $32, 
%xmm2, %xmm2 -; AVX512F-NEXT: vpsraq $32, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsraq $32, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsraq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq @@ -700,66 +446,47 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; SSE2-SSSE3-LABEL: v4i8: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pslld $24, %xmm3 -; SSE2-SSSE3-NEXT: psrad $24, %xmm3 -; SSE2-SSSE3-NEXT: pslld $24, %xmm2 -; SSE2-SSSE3-NEXT: psrad $24, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pslld $24, %xmm1 -; SSE2-SSSE3-NEXT: psrad $24, %xmm1 -; SSE2-SSSE3-NEXT: pslld $24, %xmm0 -; SSE2-SSSE3-NEXT: psrad $24, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i8: ; AVX12: # %bb.0: -; AVX12-NEXT: vpslld $24, %xmm3, %xmm3 -; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3 -; AVX12-NEXT: vpslld $24, %xmm2, %xmm2 -; AVX12-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX12-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX12-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1 +; AVX12-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax ; AVX12-NEXT: retq ; ; AVX512F-LABEL: v4i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpslld $24, %xmm3, %xmm3 -; AVX512F-NEXT: vpsrad $24, %xmm3, %xmm3 -; AVX512F-NEXT: vpslld $24, %xmm2, %xmm2 -; AVX512F-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX512F-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX512F-NEXT: vpsrad $24, 
%xmm1, %xmm1 -; AVX512F-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512F-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 -; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} +; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpslld $24, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsrad $24, %xmm3, %xmm3 -; AVX512BW-NEXT: vpslld $24, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX512BW-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX512BW-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 -; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} +; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtb %xmm3, %xmm2, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq @@ -773,66 +500,45 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; SSE2-SSSE3-LABEL: v4i16: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pslld $16, %xmm3 -; SSE2-SSSE3-NEXT: psrad $16, %xmm3 -; SSE2-SSSE3-NEXT: pslld $16, %xmm2 -; SSE2-SSSE3-NEXT: psrad $16, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pslld $16, %xmm1 -; SSE2-SSSE3-NEXT: psrad $16, %xmm1 -; SSE2-SSSE3-NEXT: pslld $16, %xmm0 -; SSE2-SSSE3-NEXT: psrad $16, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i16: ; AVX12: # %bb.0: -; AVX12-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3 -; AVX12-NEXT: vpslld $16, %xmm2, %xmm2 -; AVX12-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX12-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX12-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1 +; AVX12-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax ; AVX12-NEXT: retq ; ; AVX512F-LABEL: v4i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX512F-NEXT: vpsrad $16, %xmm3, %xmm3 -; AVX512F-NEXT: vpslld $16, %xmm2, %xmm2 -; AVX512F-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX512F-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX512F-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX512F-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512F-NEXT: vpsrad $16, %xmm0, 
%xmm0 -; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 -; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} +; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsrad $16, %xmm3, %xmm3 -; AVX512BW-NEXT: vpslld $16, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX512BW-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX512BW-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 -; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} +; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtw %xmm3, %xmm2, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq @@ -846,35 +552,23 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; SSE2-SSSE3-LABEL: v8i8: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllw $8, %xmm3 -; SSE2-SSSE3-NEXT: psraw $8, %xmm3 -; SSE2-SSSE3-NEXT: psllw $8, %xmm2 -; SSE2-SSSE3-NEXT: psraw $8, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: psllw $8, %xmm1 -; SSE2-SSSE3-NEXT: psraw $8, %xmm1 -; SSE2-SSSE3-NEXT: psllw $8, %xmm0 -; SSE2-SSSE3-NEXT: psraw $8, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v8i8: ; AVX12: # %bb.0: -; AVX12-NEXT: vpsllw $8, %xmm3, %xmm3 -; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3 -; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX12-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 -; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1 +; AVX12-NEXT: vpmovsxbw %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX12-NEXT: vpmovmskb %xmm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax @@ -882,19 +576,13 @@ ; ; AVX512F-LABEL: v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllw $8, %xmm3, %xmm3 -; AVX512F-NEXT: vpsraw $8, %xmm3, %xmm3 -; AVX512F-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX512F-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: 
vpsllw $8, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: vzeroupper @@ -902,16 +590,9 @@ ; ; AVX512BW-LABEL: v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $8, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsraw $8, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k1 -; AVX512BW-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1} +; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtb %xmm3, %xmm2, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq Index: llvm/test/CodeGen/X86/bitcast-setcc-128.ll =================================================================== --- llvm/test/CodeGen/X86/bitcast-setcc-128.ll +++ llvm/test/CodeGen/X86/bitcast-setcc-128.ll @@ -144,87 +144,45 @@ } define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { -; SSE2-SSSE3-LABEL: v2i8: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $56, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: psrad $31, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $24, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-SSSE3-NEXT: psllq $56, %xmm1 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: psrad $31, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $24, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax -; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax -; SSE2-SSSE3-NEXT: retq -; -; AVX1-LABEL: v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; SSE2-LABEL: v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,0,0,u,u,0,0,u,u,1,1,u,u,1,1] +; SSSE3-NEXT: movmskpd %xmm0, %eax +; SSSE3-NEXT: # kill: def $al killed $al killed $eax +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v2i8: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $56, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $56, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $56, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $56, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq @@ -236,85 +194,34 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; SSE2-SSSE3-LABEL: v2i16: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $48, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: psrad $31, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $16, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-SSSE3-NEXT: psllq $48, %xmm1 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: psrad $31, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $16, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 -; 
SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX12-LABEL: v2i16: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $48, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $48, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq @@ -326,73 +233,30 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; SSE2-SSSE3-LABEL: v2i32: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $32, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; 
SSE2-SSSE3-NEXT: psllq $32, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] -; SSE2-SSSE3-NEXT: psrad $31, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX12-LABEL: v2i32: +; AVX12: # %bb.0: +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: # kill: def $al killed $al killed $eax +; AVX12-NEXT: retq ; ; AVX512F-LABEL: v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq @@ -478,44 +342,34 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { ; SSE2-SSSE3-LABEL: v4i8: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pslld $24, %xmm1 -; SSE2-SSSE3-NEXT: psrad $24, %xmm1 -; SSE2-SSSE3-NEXT: pslld $24, %xmm0 -; SSE2-SSSE3-NEXT: psrad $24, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; 
SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i8: ; AVX12: # %bb.0: -; AVX12-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX12-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax ; AVX12-NEXT: retq ; ; AVX512F-LABEL: v4i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX512F-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX512F-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512F-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX512BW-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq @@ -527,44 +381,33 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { ; SSE2-SSSE3-LABEL: v4i16: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pslld $16, %xmm1 -; SSE2-SSSE3-NEXT: psrad $16, %xmm1 -; SSE2-SSSE3-NEXT: pslld $16, %xmm0 -; SSE2-SSSE3-NEXT: psrad $16, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i16: ; AVX12: # %bb.0: -; AVX12-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX12-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax ; AVX12-NEXT: retq ; ; AVX512F-LABEL: v4i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX512F-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX512F-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512F-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX512BW-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq @@ -576,11 +419,8 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { ; SSE2-SSSE3-LABEL: v8i8: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllw $8, %xmm1 -; SSE2-SSSE3-NEXT: psraw $8, %xmm1 -; SSE2-SSSE3-NEXT: psllw $8, %xmm0 -; 
SSE2-SSSE3-NEXT: psraw $8, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 ; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax @@ -588,11 +428,8 @@ ; ; AVX12-LABEL: v8i8: ; AVX12: # %bb.0: -; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX12-NEXT: vpmovmskb %xmm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax @@ -600,13 +437,9 @@ ; ; AVX512F-LABEL: v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX512F-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: vzeroupper @@ -614,11 +447,7 @@ ; ; AVX512BW-LABEL: v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: retq Index: llvm/test/CodeGen/X86/bitcast-vector-bool.ll =================================================================== --- llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -151,27 +151,14 @@ } define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { -; SSE2-LABEL: bitcast_v16i8_to_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: retq -; -; SSSE3-LABEL: bitcast_v16i8_to_v2i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v16i8_to_v2i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: bitcast_v16i8_to_v2i8: ; AVX12: # %bb.0: @@ -187,7 +174,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovb2m %xmm0, %k0 ; AVX512-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovd {{.*#+}} 
xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %ecx ; AVX512-NEXT: vpextrb $1, %xmm0, %eax ; AVX512-NEXT: addb %cl, %al @@ -318,29 +305,15 @@ } define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { -; SSE2-LABEL: bitcast_v16i16_to_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: packsswb %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: retq -; -; SSSE3-LABEL: bitcast_v16i16_to_v2i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: packsswb %xmm1, %xmm0 -; SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v16i16_to_v2i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: bitcast_v16i16_to_v2i8: ; AVX1: # %bb.0: @@ -374,7 +347,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovw2m %ymm0, %k0 ; AVX512-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %ecx ; AVX512-NEXT: vpextrb $1, %xmm0, %eax ; AVX512-NEXT: addb %cl, %al @@ -392,12 +365,10 @@ define i16 @bitcast_v32i8_to_v2i16(<32 x i8> %a0) nounwind { ; SSE2-SSSE3-LABEL: bitcast_v32i8_to_v2i16: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %ecx -; SSE2-SSSE3-NEXT: shll $16, %ecx -; SSE2-SSSE3-NEXT: orl %eax, %ecx -; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 -; SSE2-SSSE3-NEXT: pextrw $0, %xmm0, %ecx +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %ecx +; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %eax +; SSE2-SSSE3-NEXT: shll $16, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 ; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax ; SSE2-SSSE3-NEXT: addl %ecx, %eax ; SSE2-SSSE3-NEXT: # kill: def $ax killed $ax killed $eax @@ -411,7 +382,6 @@ ; AVX1-NEXT: shll $16, %ecx ; AVX1-NEXT: orl %eax, %ecx ; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpextrw $0, %xmm0, %ecx ; AVX1-NEXT: vpextrw $1, %xmm0, %eax ; AVX1-NEXT: addl %ecx, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax @@ -420,9 +390,8 @@ ; ; AVX2-LABEL: bitcast_v32i8_to_v2i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, %ecx +; AVX2-NEXT: vpmovmskb %ymm0, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 ; AVX2-NEXT: vpextrw $1, %xmm0, %eax ; AVX2-NEXT: addl %ecx, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax @@ -437,8 +406,8 @@ ; AVX512-NEXT: subq $32, %rsp ; AVX512-NEXT: vpmovb2m %ymm0, %k0 ; AVX512-NEXT: kmovd %k0, (%rsp) -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vpextrw $0, %xmm0, %ecx +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; AVX512-NEXT: vmovd %xmm0, 
%ecx ; AVX512-NEXT: vpextrw $1, %xmm0, %eax ; AVX512-NEXT: addl %ecx, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax @@ -579,33 +548,17 @@ } define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { -; SSE2-LABEL: bitcast_v16i32_to_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: packsswb %xmm2, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: retq -; -; SSSE3-LABEL: bitcast_v16i32_to_v2i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: packssdw %xmm3, %xmm2 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 -; SSSE3-NEXT: packsswb %xmm2, %xmm0 -; SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v16i32_to_v2i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: bitcast_v16i32_to_v2i8: ; AVX1: # %bb.0: @@ -646,7 +599,7 @@ ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ; AVX512-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %ecx ; AVX512-NEXT: vpextrb $1, %xmm0, %eax ; AVX512-NEXT: addb %cl, %al @@ -665,13 +618,11 @@ ; SSE2-SSSE3-LABEL: bitcast_v32i16_to_v2i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %ecx ; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %ecx -; SSE2-SSSE3-NEXT: shll $16, %ecx -; SSE2-SSSE3-NEXT: orl %eax, %ecx -; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 -; SSE2-SSSE3-NEXT: pextrw $0, %xmm0, %ecx +; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax +; SSE2-SSSE3-NEXT: shll $16, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 ; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax ; SSE2-SSSE3-NEXT: addl %ecx, %eax ; SSE2-SSSE3-NEXT: # kill: def $ax killed $ax killed $eax @@ -688,7 +639,6 @@ ; AVX1-NEXT: shll $16, %ecx ; AVX1-NEXT: orl %eax, %ecx ; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpextrw $0, %xmm0, %ecx ; AVX1-NEXT: vpextrw $1, %xmm0, %eax ; AVX1-NEXT: addl %ecx, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax @@ -699,9 +649,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, %ecx +; AVX2-NEXT: vpmovmskb %ymm0, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 ; AVX2-NEXT: vpextrw $1, %xmm0, %eax ; AVX2-NEXT: addl %ecx, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax @@ 
-716,8 +665,8 @@ ; AVX512-NEXT: subq $32, %rsp ; AVX512-NEXT: vpmovw2m %zmm0, %k0 ; AVX512-NEXT: kmovd %k0, (%rsp) -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vpextrw $0, %xmm0, %ecx +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; AVX512-NEXT: vmovd %xmm0, %ecx ; AVX512-NEXT: vpextrw $1, %xmm0, %eax ; AVX512-NEXT: addl %ecx, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax @@ -984,9 +933,9 @@ ; SSE2-SSSE3-NEXT: orl %ecx, %edx ; SSE2-SSSE3-NEXT: orl %eax, %edx ; SSE2-SSSE3-NEXT: movw %dx, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-SSSE3-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 ; SSE2-SSSE3-NEXT: movd %xmm0, %ecx -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-SSSE3-NEXT: movd %xmm0, %eax ; SSE2-SSSE3-NEXT: addl %ecx, %eax ; SSE2-SSSE3-NEXT: retq @@ -1246,7 +1195,7 @@ ; AVX1-NEXT: orl %ecx, %edx ; AVX1-NEXT: orl %eax, %edx ; AVX1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX1-NEXT: vmovd %xmm0, %ecx ; AVX1-NEXT: vpextrd $1, %xmm0, %eax ; AVX1-NEXT: addl %ecx, %eax @@ -1506,7 +1455,7 @@ ; AVX2-NEXT: orl %ecx, %edx ; AVX2-NEXT: orl %eax, %edx ; AVX2-NEXT: movl %edx, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX2-NEXT: vmovd %xmm0, %ecx ; AVX2-NEXT: vpextrd $1, %xmm0, %eax ; AVX2-NEXT: addl %ecx, %eax @@ -1517,7 +1466,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovb2m %zmm0, %k0 ; AVX512-NEXT: kmovq %k0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX512-NEXT: vmovd %xmm0, %ecx ; AVX512-NEXT: vpextrd $1, %xmm0, %eax ; AVX512-NEXT: addl %ecx, %eax Index: llvm/test/CodeGen/X86/bitreverse.ll =================================================================== --- llvm/test/CodeGen/X86/bitreverse.ll +++ llvm/test/CodeGen/X86/bitreverse.ll @@ -55,13 +55,11 @@ ; X64-NEXT: pxor %xmm1, %xmm1 ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; X64-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; X64-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; X64-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] ; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] -; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] ; X64-NEXT: packuswb %xmm2, %xmm0 ; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: psllw $4, %xmm1 @@ -81,7 +79,6 @@ ; X64-NEXT: pand {{.*}}(%rip), %xmm0 ; X64-NEXT: psrlw $1, %xmm0 ; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: psrlq $48, %xmm0 ; X64-NEXT: retq %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a) ret <2 x i16> %b @@ -410,7 +407,7 @@ ; ; X64-LABEL: fold_v2i16: ; X64: # %bb.0: -; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240] +; X64-NEXT: movaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u> ; X64-NEXT: retq %b = 
call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> ) ret <2 x i16> %b Index: llvm/test/CodeGen/X86/bswap-vector.ll =================================================================== --- llvm/test/CodeGen/X86/bswap-vector.ll +++ llvm/test/CodeGen/X86/bswap-vector.ll @@ -291,23 +291,22 @@ ; CHECK-NOSSSE3-NEXT: pxor %xmm1, %xmm1 ; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] ; CHECK-NOSSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] ; CHECK-NOSSSE3-NEXT: packuswb %xmm2, %xmm0 -; CHECK-NOSSSE3-NEXT: psrld $16, %xmm0 ; CHECK-NOSSSE3-NEXT: retq ; ; CHECK-SSSE3-LABEL: test7: ; CHECK-SSSE3: # %bb.0: # %entry -; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0],zero,zero,xmm0[5,4],zero,zero,xmm0[9,8],zero,zero,xmm0[13,12],zero,zero +; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; CHECK-SSSE3-NEXT: retq ; ; CHECK-AVX-LABEL: test7: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0],zero,zero,xmm0[5,4],zero,zero,xmm0[9,8],zero,zero,xmm0[13,12],zero,zero +; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; CHECK-AVX-NEXT: retq ; ; CHECK-WIDE-AVX-LABEL: test7: Index: llvm/test/CodeGen/X86/buildvec-insertvec.ll =================================================================== --- llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -6,22 +6,29 @@ ; SSE2-LABEL: foo: ; SSE2: # %bb.0: ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: movl $255, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: shll $8, %ecx +; SSE2-NEXT: orl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl $65280, %eax # imm = 0xFF00 +; SSE2-NEXT: orl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: pinsrw $1, %eax, %xmm0 ; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: foo: ; SSE41: # %bb.0: ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE41-NEXT: pextrb $8, %xmm0, %eax +; SSE41-NEXT: pextrb $4, %xmm0, %ecx +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm0 +; SSE41-NEXT: pinsrb $2, %eax, %xmm0 ; SSE41-NEXT: movl $255, %eax -; SSE41-NEXT: pinsrd $3, %eax, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pinsrb $3, %eax, %xmm0 ; SSE41-NEXT: movd %xmm0, (%rdi) ; SSE41-NEXT: retq %t0 = 
fptoui <3 x float> %in to <3 x i8> Index: llvm/test/CodeGen/X86/combine-64bit-vec-binop.ll =================================================================== --- llvm/test/CodeGen/X86/combine-64bit-vec-binop.ll +++ llvm/test/CodeGen/X86/combine-64bit-vec-binop.ll @@ -101,9 +101,9 @@ define double @test3_mul(double %A, double %B) { ; SSE41-LABEL: test3_mul: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: pmullw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmullw %xmm1, %xmm0 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; SSE41-NEXT: retq %1 = bitcast double %A to <8 x i8> Index: llvm/test/CodeGen/X86/combine-or.ll =================================================================== --- llvm/test/CodeGen/X86/combine-or.ll +++ llvm/test/CodeGen/X86/combine-or.ll @@ -362,7 +362,7 @@ define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) { ; CHECK-LABEL: test_crash: ; CHECK: # %bb.0: -; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32> Index: llvm/test/CodeGen/X86/cvtv2f32.ll =================================================================== --- llvm/test/CodeGen/X86/cvtv2f32.ll +++ llvm/test/CodeGen/X86/cvtv2f32.ll @@ -42,11 +42,9 @@ define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) { ; X32-LABEL: uitofp_2i32_buildvector_cvt: ; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X32-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; X32-NEXT: orpd %xmm1, %xmm2 +; X32-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X32-NEXT: por %xmm1, %xmm2 ; X32-NEXT: subpd %xmm1, %xmm2 ; X32-NEXT: cvtpd2ps %xmm2, %xmm1 ; X32-NEXT: mulps %xmm1, %xmm0 @@ -54,13 +52,13 @@ ; ; X64-LABEL: uitofp_2i32_buildvector_cvt: ; X64: # %bb.0: -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: movd %edi, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X64-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; X64-NEXT: por %xmm1, %xmm2 -; X64-NEXT: subpd %xmm1, %xmm2 -; X64-NEXT: cvtpd2ps %xmm2, %xmm1 +; X64-NEXT: movd %edi, %xmm1 +; X64-NEXT: pinsrd $1, %esi, %xmm1 +; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: subpd %xmm2, %xmm1 +; X64-NEXT: cvtpd2ps %xmm1, %xmm1 ; X64-NEXT: mulps %xmm1, %xmm0 ; X64-NEXT: retq %t1 = insertelement <2 x i32> undef, i32 %x, i32 0 @@ -73,23 +71,21 @@ define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) { ; X32-LABEL: 
uitofp_2i32_legalized: ; X32: # %bb.0: -; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; X32-NEXT: movaps {{.*#+}} xmm0 = [4.503599627370496E+15,4.503599627370496E+15] -; X32-NEXT: orps %xmm0, %xmm2 -; X32-NEXT: subpd %xmm0, %xmm2 -; X32-NEXT: cvtpd2ps %xmm2, %xmm0 +; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X32-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; X32-NEXT: por %xmm2, %xmm0 +; X32-NEXT: subpd %xmm2, %xmm0 +; X32-NEXT: cvtpd2ps %xmm0, %xmm0 ; X32-NEXT: mulps %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: uitofp_2i32_legalized: ; X64: # %bb.0: -; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; X64-NEXT: movaps {{.*#+}} xmm0 = [4.503599627370496E+15,4.503599627370496E+15] -; X64-NEXT: orps %xmm0, %xmm2 -; X64-NEXT: subpd %xmm0, %xmm2 -; X64-NEXT: cvtpd2ps %xmm2, %xmm0 +; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; X64-NEXT: por %xmm2, %xmm0 +; X64-NEXT: subpd %xmm2, %xmm0 +; X64-NEXT: cvtpd2ps %xmm0, %xmm0 ; X64-NEXT: mulps %xmm1, %xmm0 ; X64-NEXT: retq %t1 = uitofp <2 x i32> %in to <2 x float> Index: llvm/test/CodeGen/X86/extract-concat.ll =================================================================== --- llvm/test/CodeGen/X86/extract-concat.ll +++ llvm/test/CodeGen/X86/extract-concat.ll @@ -5,9 +5,14 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pextrb $8, %xmm0, %eax +; CHECK-NEXT: pextrb $4, %xmm0, %ecx +; CHECK-NEXT: pextrb $0, %xmm0, %edx +; CHECK-NEXT: movd %edx, %xmm0 +; CHECK-NEXT: pinsrb $1, %ecx, %xmm0 +; CHECK-NEXT: pinsrb $2, %eax, %xmm0 ; CHECK-NEXT: movl $255, %eax -; CHECK-NEXT: pinsrd $3, %eax, %xmm0 -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-NEXT: pinsrb $3, %eax, %xmm0 ; CHECK-NEXT: movd %xmm0, (%rdi) ; CHECK-NEXT: retq %t0 = fptosi <4 x float> %in to <4 x i32> Index: llvm/test/CodeGen/X86/extract-insert.ll =================================================================== --- llvm/test/CodeGen/X86/extract-insert.ll +++ llvm/test/CodeGen/X86/extract-insert.ll @@ -31,12 +31,10 @@ define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind { ; X86-LABEL: extractelt_bitcast_extra_use: ; X86: # %bb.0: -; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: popl %ecx ; X86-NEXT: retl ; ; X64-LABEL: extractelt_bitcast_extra_use: Index: llvm/test/CodeGen/X86/f16c-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/f16c-intrinsics.ll +++ llvm/test/CodeGen/X86/f16c-intrinsics.ll @@ -268,18 +268,12 @@ ; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m: ; X32-AVX512VL: # %bb.0: # %entry ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03] -; X32-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0] -; X32-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X32-AVX512VL-NEXT: vpmovdw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x00] +; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX 
Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m: ; X64-AVX512VL: # %bb.0: # %entry -; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03] -; X64-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0] -; X64-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-AVX512VL-NEXT: vpmovdw %xmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07] +; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a, i32 3) Index: llvm/test/CodeGen/X86/fold-vector-sext-zext.ll =================================================================== --- llvm/test/CodeGen/X86/fold-vector-sext-zext.ll +++ llvm/test/CodeGen/X86/fold-vector-sext-zext.ll @@ -11,12 +11,12 @@ define <4 x i16> @test_sext_4i8_4i16() { ; X32-LABEL: test_sext_4i8_4i16: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293] +; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u> ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293] +; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u> ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -29,12 +29,12 @@ define <4 x i16> @test_sext_4i8_4i16_undef() { ; X32-LABEL: test_sext_4i8_4i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = +; X32-NEXT: vmovaps {{.*#+}} xmm0 = ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = +; X64-NEXT: vmovaps {{.*#+}} xmm0 = ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -207,12 +207,12 @@ define <4 x i16> @test_zext_4i8_4i16() { ; X32-LABEL: test_zext_4i8_4i16: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253] +; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u> ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253] +; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u> ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -261,12 +261,12 @@ define <4 x i16> @test_zext_4i8_4i16_undef() { ; X32-LABEL: test_zext_4i8_4i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,0,253] +; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u> ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,0,253] +; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u> ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 Index: llvm/test/CodeGen/X86/insertelement-shuffle.ll =================================================================== --- llvm/test/CodeGen/X86/insertelement-shuffle.ll +++ llvm/test/CodeGen/X86/insertelement-shuffle.ll @@ -30,18 +30,10 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind { ; X86_AVX256-LABEL: insert_subvector_512: ; X86_AVX256: # %bb.0: -; X86_AVX256-NEXT: pushl %ebp -; X86_AVX256-NEXT: movl %esp, %ebp -; 
X86_AVX256-NEXT: andl $-8, %esp -; X86_AVX256-NEXT: subl $8, %esp -; X86_AVX256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X86_AVX256-NEXT: vmovlps %xmm2, (%esp) ; X86_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2 -; X86_AVX256-NEXT: vpinsrd $0, (%esp), %xmm2, %xmm2 +; X86_AVX256-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; X86_AVX256-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; X86_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; X86_AVX256-NEXT: movl %ebp, %esp -; X86_AVX256-NEXT: popl %ebp ; X86_AVX256-NEXT: retl ; ; X64_AVX256-LABEL: insert_subvector_512: Index: llvm/test/CodeGen/X86/known-bits-vector.ll =================================================================== --- llvm/test/CodeGen/X86/known-bits-vector.ll +++ llvm/test/CodeGen/X86/known-bits-vector.ll @@ -76,15 +76,15 @@ ; X32-LABEL: knownbits_mask_shuffle_sext: ; X32: # %bb.0: ; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X32-NEXT: retl ; ; X64-LABEL: knownbits_mask_shuffle_sext: ; X64: # %bb.0: ; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: retq %1 = and <8 x i16> %a0, %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> @@ -96,15 +96,15 @@ ; X32-LABEL: knownbits_mask_shuffle_shuffle_sext: ; X32: # %bb.0: ; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X32-NEXT: retl ; ; X64-LABEL: knownbits_mask_shuffle_shuffle_sext: ; X64: # %bb.0: ; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: retq %1 = and <8 x i16> %a0, %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> Index: llvm/test/CodeGen/X86/known-bits.ll =================================================================== --- llvm/test/CodeGen/X86/known-bits.ll +++ llvm/test/CodeGen/X86/known-bits.ll @@ -5,100 +5,44 @@ define void @knownbits_zext_in_reg(i8*) nounwind { ; X32-LABEL: knownbits_zext_in_reg: ; X32: # %bb.0: # %BB -; X32-NEXT: pushl %ebp ; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %edi -; X32-NEXT: pushl %esi -; X32-NEXT: subl $16, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movzbl (%eax), %ecx ; X32-NEXT: imull $101, %ecx, %eax ; X32-NEXT: shrl $14, %eax -; X32-NEXT: imull $177, %ecx, %ecx -; X32-NEXT: shrl $14, %ecx -; X32-NEXT: movzbl %al, %eax -; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1 -; X32-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; X32-NEXT: vpand %xmm2, %xmm1, %xmm1 -; X32-NEXT: movzbl 
%cl, %eax -; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X32-NEXT: vpand %xmm2, %xmm0, %xmm0 -; X32-NEXT: vpextrd $1, %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: xorl %ecx, %ecx -; X32-NEXT: vmovd %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: vmovd %xmm0, (%esp) # 4-byte Folded Spill -; X32-NEXT: vpextrd $2, %xmm1, %edi -; X32-NEXT: vpextrd $2, %xmm0, %esi -; X32-NEXT: vpextrd $3, %xmm1, %ebx -; X32-NEXT: vpextrd $3, %xmm0, %ebp +; X32-NEXT: imull $177, %ecx, %edx +; X32-NEXT: shrl $14, %edx +; X32-NEXT: movzbl %al, %ecx +; X32-NEXT: xorl %ebx, %ebx ; X32-NEXT: .p2align 4, 0x90 ; X32-NEXT: .LBB0_1: # %CF ; X32-NEXT: # =>This Loop Header: Depth=1 ; X32-NEXT: # Child Loop BB0_2 Depth 2 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: divl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: divl (%esp) # 4-byte Folded Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: divl %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: divl %ebp +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: divb %dl ; X32-NEXT: .p2align 4, 0x90 ; X32-NEXT: .LBB0_2: # %CF237 ; X32-NEXT: # Parent Loop BB0_1 Depth=1 ; X32-NEXT: # => This Inner Loop Header: Depth=2 -; X32-NEXT: testb %cl, %cl +; X32-NEXT: testb %bl, %bl ; X32-NEXT: jne .LBB0_2 ; X32-NEXT: jmp .LBB0_1 ; ; X64-LABEL: knownbits_zext_in_reg: ; X64: # %bb.0: # %BB -; X64-NEXT: pushq %rbp -; X64-NEXT: pushq %rbx ; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: imull $101, %eax, %ecx ; X64-NEXT: shrl $14, %ecx -; X64-NEXT: imull $177, %eax, %eax -; X64-NEXT: shrl $14, %eax +; X64-NEXT: imull $177, %eax, %edx +; X64-NEXT: shrl $14, %edx ; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; X64-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm1 -; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X64-NEXT: vpand %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpextrd $1, %xmm1, %r8d -; X64-NEXT: vpextrd $1, %xmm0, %r9d ; X64-NEXT: xorl %esi, %esi -; X64-NEXT: vmovd %xmm1, %r10d -; X64-NEXT: vmovd %xmm0, %r11d -; X64-NEXT: vpextrd $2, %xmm1, %edi -; X64-NEXT: vpextrd $2, %xmm0, %ebx -; X64-NEXT: vpextrd $3, %xmm1, %ecx -; X64-NEXT: vpextrd $3, %xmm0, %ebp ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB0_1: # %CF ; X64-NEXT: # =>This Loop Header: Depth=1 ; X64-NEXT: # Child Loop BB0_2 Depth 2 -; X64-NEXT: movl %r8d, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %r9d -; X64-NEXT: movl %r10d, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %r11d -; X64-NEXT: movl %edi, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ebx ; X64-NEXT: movl %ecx, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ebp +; X64-NEXT: divb %dl ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB0_2: # %CF237 ; X64-NEXT: # Parent Loop BB0_1 Depth=1 Index: llvm/test/CodeGen/X86/known-signbits-vector.ll =================================================================== --- llvm/test/CodeGen/X86/known-signbits-vector.ll +++ llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -280,8 +280,7 @@ ; X64-NEXT: vpand %xmm1, %xmm0, %xmm2 ; X64-NEXT: vpor %xmm1, %xmm2, %xmm1 ; X64-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; X64-NEXT: vmovq 
%xmm0, %rax -; X64-NEXT: vcvtsi2ss %eax, %xmm3, %xmm0 +; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <2 x i32> Index: llvm/test/CodeGen/X86/lower-bitcast.ll =================================================================== --- llvm/test/CodeGen/X86/lower-bitcast.ll +++ llvm/test/CodeGen/X86/lower-bitcast.ll @@ -9,9 +9,7 @@ define double @test1(double %A) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq ; ; CHECK-WIDE-LABEL: test1: @@ -68,9 +66,7 @@ ; CHECK-LABEL: test4: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq ; @@ -108,9 +104,7 @@ define double @test6(double %A) { ; CHECK-LABEL: test6: ; CHECK: # %bb.0: -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: retq ; ; CHECK-WIDE-LABEL: test6: @@ -147,9 +141,7 @@ define double @test8(double %A) { ; CHECK-LABEL: test8: ; CHECK: # %bb.0: -; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-NEXT: paddb {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retq ; ; CHECK-WIDE-LABEL: test8: Index: llvm/test/CodeGen/X86/madd.ll =================================================================== --- llvm/test/CodeGen/X86/madd.ll +++ llvm/test/CodeGen/X86/madd.ll @@ -1876,26 +1876,12 @@ ; ; AVX1-LABEL: larger_mul: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpmovsxwd %xmm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: larger_mul: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2597,29 +2583,29 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa (%rsi), %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pmulhw %xmm2, %xmm4 -; SSE2-NEXT: pmullw %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm2[1,0,3,2,4,5,6,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pmulhw %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pmulhw %xmm1, %xmm3 ; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: pmaddwd_bad_indices: @@ -2627,13 +2613,13 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rsi), %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,4,5,10,11,12,13,12,13,10,11,12,13,14,15] -; AVX-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,8,9,14,15,8,9,14,15,12,13,14,15] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX-NEXT: vpmovsxwd %xmm2, %xmm2 ; AVX-NEXT: vpmovsxwd %xmm3, %xmm3 ; AVX-NEXT: vpmulld %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,8,9,14,15,8,9,14,15,12,13,14,15] ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] ; AVX-NEXT: vpmovsxwd %xmm1, %xmm1 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0 Index: llvm/test/CodeGen/X86/masked_compressstore.ll =================================================================== --- llvm/test/CodeGen/X86/masked_compressstore.ll +++ llvm/test/CodeGen/X86/masked_compressstore.ll @@ -669,19 +669,16 @@ define void @compressstore_v2f32_v2i32(float* %base, <2 x float> %V, <2 x i32> %trigger) { ; SSE2-LABEL: compressstore_v2f32_v2i32: ; SSE2: ## %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store ; SSE2-NEXT: movss %xmm0, (%rdi) ; SSE2-NEXT: addq $4, %rdi ; SSE2-NEXT: LBB2_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_4 ; SSE2-NEXT: ## %bb.3: ## %cond.store1 @@ -693,16 +690,15 @@ ; SSE42-LABEL: compressstore_v2f32_v2i32: ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE42-NEXT: pcmpeqq %xmm2, %xmm1 -; SSE42-NEXT: pextrb $0, %xmm1, %eax +; SSE42-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE42-NEXT: pextrb $0, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: 
je LBB2_2 ; SSE42-NEXT: ## %bb.1: ## %cond.store ; SSE42-NEXT: movss %xmm0, (%rdi) ; SSE42-NEXT: addq $4, %rdi ; SSE42-NEXT: LBB2_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm1, %eax +; SSE42-NEXT: pextrb $4, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_4 ; SSE42-NEXT: ## %bb.3: ## %cond.store1 @@ -710,65 +706,51 @@ ; SSE42-NEXT: LBB2_4: ## %else2 ; SSE42-NEXT: retq ; -; AVX1-LABEL: compressstore_v2f32_v2i32: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $0, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB2_2 -; AVX1-NEXT: ## %bb.1: ## %cond.store -; AVX1-NEXT: vmovss %xmm0, (%rdi) -; AVX1-NEXT: addq $4, %rdi -; AVX1-NEXT: LBB2_2: ## %else -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB2_4 -; AVX1-NEXT: ## %bb.3: ## %cond.store1 -; AVX1-NEXT: vextractps $1, %xmm0, (%rdi) -; AVX1-NEXT: LBB2_4: ## %else2 -; AVX1-NEXT: retq -; -; AVX2-LABEL: compressstore_v2f32_v2i32: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $0, %xmm1, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB2_2 -; AVX2-NEXT: ## %bb.1: ## %cond.store -; AVX2-NEXT: vmovss %xmm0, (%rdi) -; AVX2-NEXT: addq $4, %rdi -; AVX2-NEXT: LBB2_2: ## %else -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB2_4 -; AVX2-NEXT: ## %bb.3: ## %cond.store1 -; AVX2-NEXT: vextractps $1, %xmm0, (%rdi) -; AVX2-NEXT: LBB2_4: ## %else2 -; AVX2-NEXT: retq +; AVX1OR2-LABEL: compressstore_v2f32_v2i32: +; AVX1OR2: ## %bb.0: +; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpextrb $0, %xmm1, %eax +; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: je LBB2_2 +; AVX1OR2-NEXT: ## %bb.1: ## %cond.store +; AVX1OR2-NEXT: vmovss %xmm0, (%rdi) +; AVX1OR2-NEXT: addq $4, %rdi +; AVX1OR2-NEXT: LBB2_2: ## %else +; AVX1OR2-NEXT: vpextrb $4, %xmm1, %eax +; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: je LBB2_4 +; AVX1OR2-NEXT: ## %bb.3: ## %cond.store1 +; AVX1OR2-NEXT: vextractps $1, %xmm0, (%rdi) +; AVX1OR2-NEXT: LBB2_4: ## %else2 +; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: compressstore_v2f32_v2i32: ; AVX512F: ## %bb.0: +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vcompressps %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: compressstore_v2f32_v2i32: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vcompressps %xmm0, (%rdi) {%k1} -; AVX512VL-NEXT: retq +; AVX512VLDQ-LABEL: compressstore_v2f32_v2i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: vcompressps %xmm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: 
compressstore_v2f32_v2i32: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLBW-NEXT: vcompressps %xmm0, (%rdi) {%k1} +; AVX512VLBW-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.compressstore.v2f32(<2 x float> %V, float* %base, <2 x i1> %mask) ret void Index: llvm/test/CodeGen/X86/masked_expandload.ll =================================================================== --- llvm/test/CodeGen/X86/masked_expandload.ll +++ llvm/test/CodeGen/X86/masked_expandload.ll @@ -1254,20 +1254,17 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x i32> %trigger) { ; SSE2-LABEL: expandload_v2f32_v2i1: ; SSE2: ## %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB4_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: addq $4, %rdi ; SSE2-NEXT: LBB4_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB4_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 @@ -1281,17 +1278,16 @@ ; SSE42-LABEL: expandload_v2f32_v2i1: ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE42-NEXT: pcmpeqq %xmm2, %xmm1 -; SSE42-NEXT: pextrb $0, %xmm1, %eax +; SSE42-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE42-NEXT: pextrb $0, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB4_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] +; SSE42-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; SSE42-NEXT: addq $4, %rdi ; SSE42-NEXT: LBB4_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm1, %eax +; SSE42-NEXT: pextrb $4, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB4_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 @@ -1302,8 +1298,7 @@ ; AVX1-LABEL: expandload_v2f32_v2i1: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpextrb $0, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je LBB4_2 @@ -1312,7 +1307,7 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: LBB4_2: ## %else -; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: vpextrb $4, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je LBB4_4 ; AVX1-NEXT: ## %bb.3: ## %cond.load1 @@ -1323,8 +1318,7 @@ ; AVX2-LABEL: expandload_v2f32_v2i1: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpextrb $0, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB4_2 @@ -1333,7 +1327,7 @@ ; 
AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: LBB4_2: ## %else -; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: vpextrb $4, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB4_4 ; AVX2-NEXT: ## %bb.3: ## %cond.load1 @@ -1343,10 +1337,9 @@ ; ; AVX512F-LABEL: expandload_v2f32_v2i1: ; AVX512F: ## %bb.0: +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} @@ -1354,13 +1347,21 @@ ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: expandload_v2f32_v2i1: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vexpandps (%rdi), %xmm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VLDQ-LABEL: expandload_v2f32_v2i1: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: vexpandps (%rdi), %xmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: expandload_v2f32_v2i1: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLBW-NEXT: vexpandps (%rdi), %xmm0 {%k1} +; AVX512VLBW-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x float> @llvm.masked.expandload.v2f32(float* %base, <2 x i1> %mask, <2 x float> %src0) ret <2 x float> %res Index: llvm/test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -912,13 +912,12 @@ ; KNL_64-LABEL: test17: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovapd %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -926,36 +925,31 @@ ; KNL_32-LABEL: test17: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovapd %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test17: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX-NEXT: vpsllq $63, %xmm1, 
%xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovapd %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test17: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %xmm2 {%k1} ; SKX_32-NEXT: vmovapd %xmm2, %xmm0 ; SKX_32-NEXT: retl @@ -1077,8 +1071,8 @@ ; ; KNL_32-LABEL: test20: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 @@ -1096,7 +1090,6 @@ ; ; SKX_32-LABEL: test20: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1} @@ -1110,9 +1103,9 @@ ; KNL_64-LABEL: test21: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -1121,10 +1114,10 @@ ; ; KNL_32-LABEL: test21: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} @@ -1135,7 +1128,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} ; SKX-NEXT: retq ; @@ -1143,8 +1135,6 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) @@ -1158,7 +1148,7 @@ ; KNL_64-LABEL: test22: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 @@ -1171,7 +1161,7 @@ ; KNL_32-LABEL: test22: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 @@ -1184,7 +1174,6 @@ ; ; SKX-LABEL: test22: ; SKX: # %bb.0: -; SKX-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} @@ -1193,7 +1182,6 @@ ; ; SKX_32-LABEL: test22: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1261,28 +1249,28 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) { ; KNL_64-LABEL: test23: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1290,10 +1278,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test23: @@ -1301,10 +1287,8 @@ ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind @@ -1315,28 +1299,28 @@ define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) { ; KNL_64-LABEL: test23b: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1} -; 
KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1} +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23b: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1} +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1344,9 +1328,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test23b: @@ -1354,9 +1337,8 @@ ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) @@ -1366,22 +1348,22 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test24: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: movw $3, %ax ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test24: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: movw $3, %cx ; KNL_32-NEXT: kmovw %ecx, %k1 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1389,9 +1371,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovw %eax, %k1 -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX-NEXT: vmovdqa %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test24: @@ -1399,9 +1380,8 @@ ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: movb $3, %cl ; SKX_32-NEXT: kmovw %ecx, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SKX_32-NEXT: 
vmovdqa %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind @@ -1413,13 +1393,12 @@ ; KNL_64-LABEL: test25: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -1427,36 +1406,31 @@ ; KNL_32-LABEL: test25: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test25: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test25: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm2 {%k1} ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -1469,11 +1443,10 @@ ; KNL_64-LABEL: test26: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} +; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm1 {%k1} ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq @@ -1481,32 +1454,27 @@ ; KNL_32-LABEL: test26: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: movb $3, %cl ; KNL_32-NEXT: kmovw %ecx, %k1 -; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} +; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test26: ; SKX: # %bb.0: -; SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} +; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm1 {%k1} ; SKX-NEXT: vmovdqa 
%xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test26: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0 -; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 -; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1} +; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm1 {%k1} ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -1519,40 +1487,40 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test27: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: movw $3, %ax ; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test27: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: movw $3, %cx ; KNL_32-NEXT: kmovw %ecx, %k1 -; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test27: ; SKX: # %bb.0: -; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovw %eax, %k1 -; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1} +; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test27: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: movb $3, %cl ; SKX_32-NEXT: kmovw %ecx, %k1 -; SKX_32-NEXT: vgatherdps (%eax,%xmm1,4), %xmm0 {%k1} +; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1} +; SKX_32-NEXT: vmovaps %xmm1, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind @@ -1565,7 +1533,7 @@ ; KNL_64-LABEL: test28: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -1574,8 +1542,8 @@ ; ; KNL_32-LABEL: test28: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: movw $3, %ax ; KNL_32-NEXT: kmovw %eax, %k1 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} @@ -1584,7 +1552,6 @@ ; ; SKX-LABEL: test28: ; SKX: # %bb.0: -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} ; SKX-NEXT: retq @@ -1593,8 +1560,6 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: movb $3, %al ; SKX_32-NEXT: kmovw %eax, %k1 -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl call void 
@llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> ) @@ -2611,9 +2576,7 @@ define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) { ; KNL_64-LABEL: sext_v8i8_index: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; KNL_64-NEXT: vpslld $24, %ymm0, %ymm0 -; KNL_64-NEXT: vpsrad $24, %ymm0, %ymm1 +; KNL_64-NEXT: vpmovsxbd %xmm0, %ymm1 ; KNL_64-NEXT: movw $255, %ax ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} @@ -2622,10 +2585,8 @@ ; ; KNL_32-LABEL: sext_v8i8_index: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpslld $24, %ymm0, %ymm0 -; KNL_32-NEXT: vpsrad $24, %ymm0, %ymm1 +; KNL_32-NEXT: vpmovsxbd %xmm0, %ymm1 ; KNL_32-NEXT: movw $255, %cx ; KNL_32-NEXT: kmovw %ecx, %k1 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} @@ -2634,20 +2595,16 @@ ; ; SKX-LABEL: sext_v8i8_index: ; SKX: # %bb.0: -; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SKX-NEXT: vpmovsxbd %xmm0, %ymm1 ; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: vpslld $24, %ymm0, %ymm0 -; SKX-NEXT: vpsrad $24, %ymm0, %ymm1 ; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; SKX-NEXT: retq ; ; SKX_32-LABEL: sext_v8i8_index: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vpmovsxbd %xmm0, %ymm1 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 -; SKX_32-NEXT: vpslld $24, %ymm0, %ymm0 -; SKX_32-NEXT: vpsrad $24, %ymm0, %ymm1 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; SKX_32-NEXT: retl @@ -2663,28 +2620,26 @@ define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) { ; KNL_64-LABEL: test_scatter_2i32_index: ; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_64-NEXT: vpsllq $32, %xmm1, %xmm1 -; KNL_64-NEXT: vpsraq $32, %zmm1, %zmm1 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1} +; KNL_64-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1} ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test_scatter_2i32_index: ; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1 -; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm1,8) {%k1} +; KNL_32-NEXT: vscatterdpd %zmm0, (%eax,%ymm1,8) {%k1} ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -2692,19 +2647,15 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vpsllq $32, %xmm1, %xmm1 -; SKX-NEXT: vpsraq $32, %xmm1, %xmm1 -; SKX-NEXT: vscatterqpd %xmm0, 
(%rdi,%xmm1,8) {%k1} +; SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1} ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_scatter_2i32_index: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1 -; SKX_32-NEXT: vpsllq $32, %xmm1, %xmm1 -; SKX_32-NEXT: vpsraq $32, %xmm1, %xmm1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vscatterqpd %xmm0, (%eax,%xmm1,8) {%k1} +; SKX_32-NEXT: vscatterdpd %xmm0, (%eax,%xmm1,8) {%k1} ; SKX_32-NEXT: retl %gep = getelementptr double, double *%base, <2 x i32> %ind call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask) Index: llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll =================================================================== --- llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -30,24 +30,21 @@ ; ; PROMOTE_SKX-LABEL: test_gather_v2i32_index: ; PROMOTE_SKX: # %bb.0: -; PROMOTE_SKX-NEXT: vpsllq $32, %xmm0, %xmm0 -; PROMOTE_SKX-NEXT: vpsraq $32, %xmm0, %xmm0 ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 -; PROMOTE_SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} +; PROMOTE_SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1} ; PROMOTE_SKX-NEXT: vmovapd %xmm2, %xmm0 ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_gather_v2i32_index: ; PROMOTE_KNL: # %bb.0: ; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; PROMOTE_KNL-NEXT: vpsllq $32, %xmm0, %xmm0 -; PROMOTE_KNL-NEXT: vpsraq $32, %zmm0, %zmm0 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} +; PROMOTE_KNL-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} ; PROMOTE_KNL-NEXT: vmovapd %xmm2, %xmm0 ; PROMOTE_KNL-NEXT: vzeroupper ; PROMOTE_KNL-NEXT: retq @@ -61,11 +58,8 @@ ; ; PROMOTE_AVX2-LABEL: test_gather_v2i32_index: ; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpsllq $32, %xmm0, %xmm3 -; PROMOTE_AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 -; PROMOTE_AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] ; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm1, %xmm1 -; PROMOTE_AVX2-NEXT: vgatherqpd %xmm1, (%rdi,%xmm0,8), %xmm2 +; PROMOTE_AVX2-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2 ; PROMOTE_AVX2-NEXT: vmovapd %xmm2, %xmm0 ; PROMOTE_AVX2-NEXT: retq %gep.random = getelementptr double, double* %base, <2 x i32> %ind @@ -97,21 +91,18 @@ ; PROMOTE_SKX: # %bb.0: ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 -; PROMOTE_SKX-NEXT: vpsllq $32, %xmm1, %xmm1 -; PROMOTE_SKX-NEXT: vpsraq $32, %xmm1, %xmm1 -; PROMOTE_SKX-NEXT: vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1} +; PROMOTE_SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1} ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_scatter_v2i32_index: ; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; PROMOTE_KNL-NEXT: vpsllq $32, %xmm1, %xmm1 -; PROMOTE_KNL-NEXT: vpsraq $32, %zmm1, %zmm1 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1} +; PROMOTE_KNL-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1} 
; PROMOTE_KNL-NEXT: vzeroupper ; PROMOTE_KNL-NEXT: retq ; @@ -140,9 +131,7 @@ ; ; PROMOTE_AVX2-LABEL: test_scatter_v2i32_index: ; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpsllq $32, %xmm1, %xmm3 -; PROMOTE_AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 -; PROMOTE_AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] +; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vpsllq $3, %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3 ; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3 @@ -193,21 +182,20 @@ ; PROMOTE_SKX: # %bb.0: ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; PROMOTE_SKX-NEXT: vpgatherqd (,%xmm0), %xmm1 {%k1} -; PROMOTE_SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; PROMOTE_SKX-NEXT: vpgatherqd (,%xmm0), %xmm2 {%k1} +; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_gather_v2i32_data: ; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vpgatherqd (,%zmm0), %ymm1 {%k1} -; PROMOTE_KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; PROMOTE_KNL-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1} +; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_KNL-NEXT: vzeroupper ; PROMOTE_KNL-NEXT: retq ; @@ -221,11 +209,10 @@ ; ; PROMOTE_AVX2-LABEL: test_gather_v2i32_data: ; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vpgatherqd %xmm1, (,%xmm0), %xmm2 -; PROMOTE_AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_AVX2-NEXT: retq %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptr, i32 4, <2 x i1> %mask, <2 x i32> %src0) ret <2 x i32>%res @@ -255,16 +242,15 @@ ; PROMOTE_SKX: # %bb.0: ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; PROMOTE_SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_scatter_v2i32_data: ; PROMOTE_KNL: # %bb.0: ; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 ; PROMOTE_KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -303,7 +289,7 @@ ; PROMOTE_AVX2-NEXT: je .LBB3_4 ; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1 ; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax -; PROMOTE_AVX2-NEXT: vextractps $2, %xmm0, (%rax) +; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax) ; PROMOTE_AVX2-NEXT: .LBB3_4: # %else2 ; PROMOTE_AVX2-NEXT: retq call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) @@ -336,22 +322,20 @@ ; PROMOTE_SKX: # %bb.0: ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 -; 
PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; PROMOTE_SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; PROMOTE_SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; PROMOTE_SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1} +; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_gather_v2i32_data_index: ; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; PROMOTE_KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; PROMOTE_KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} +; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_KNL-NEXT: vzeroupper ; PROMOTE_KNL-NEXT: retq ; @@ -365,12 +349,10 @@ ; ; PROMOTE_AVX2-LABEL: test_gather_v2i32_data_index: ; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; PROMOTE_AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero ; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2 -; PROMOTE_AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; PROMOTE_AVX2-NEXT: retq %gep.random = getelementptr i32, i32* %base, <2 x i32> %ind %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) @@ -401,17 +383,15 @@ ; PROMOTE_SKX: # %bb.0: ; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; PROMOTE_SKX-NEXT: vpscatterdd %xmm0, (%rdi,%xmm1,4) {%k1} ; PROMOTE_SKX-NEXT: retq ; ; PROMOTE_KNL-LABEL: test_scatter_v2i32_data_index: ; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 ; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 ; PROMOTE_KNL-NEXT: vpscatterdd %zmm0, (%rdi,%zmm1,4) {%k1} @@ -443,9 +423,7 @@ ; ; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data_index: ; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpsllq $32, %xmm1, %xmm3 -; PROMOTE_AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 -; PROMOTE_AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] +; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vpsllq $2, %xmm1, %xmm1 ; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3 ; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3 @@ -462,7 +440,7 @@ ; PROMOTE_AVX2-NEXT: je .LBB5_4 ; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1 ; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax -; PROMOTE_AVX2-NEXT: vextractps $2, %xmm0, (%rax) +; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax) ; PROMOTE_AVX2-NEXT: .LBB5_4: # %else2 ; PROMOTE_AVX2-NEXT: retq %gep = getelementptr 
i32, i32 *%base, <2 x i32> %ind Index: llvm/test/CodeGen/X86/masked_load.ll =================================================================== --- llvm/test/CodeGen/X86/masked_load.ll +++ llvm/test/CodeGen/X86/masked_load.ll @@ -572,38 +572,40 @@ ; ; AVX1-LABEL: load_v8f64_v8i16: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpmovsxdq %xmm4, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm3 -; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm1, %ymm0 -; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm4, %ymm1 -; AVX1-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 +; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 +; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_v8f64_v8i16: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm3 -; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm1, %ymm0 -; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm4, %ymm1 -; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 +; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 +; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_v8f64_v8i16: @@ -820,19 +822,16 @@ define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { ; SSE2-LABEL: load_v2f32_v2i32: ; SSE2: ## %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB7_2 ; 
SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: LBB7_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB7_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 @@ -847,16 +846,15 @@ ; SSE42-LABEL: load_v2f32_v2i32: ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: pcmpeqq %xmm2, %xmm0 -; SSE42-NEXT: pextrb $0, %xmm0, %eax +; SSE42-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE42-NEXT: pextrb $0, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB7_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: LBB7_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm0, %eax +; SSE42-NEXT: pextrb $4, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB7_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 @@ -865,32 +863,20 @@ ; SSE42-NEXT: movaps %xmm1, %xmm0 ; SSE42-NEXT: retq ; -; AVX1-LABEL: load_v2f32_v2i32: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_v2f32_v2i32: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX1OR2-LABEL: load_v2f32_v2i32: +; AVX1OR2: ## %bb.0: +; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX1OR2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: load_v2f32_v2i32: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} @@ -898,13 +884,21 @@ ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: load_v2f32_v2i32: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VLDQ-LABEL: load_v2f32_v2i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; 
AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: load_v2f32_v2i32: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLBW-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} +; AVX512VLBW-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) ret <2 x float> %res @@ -913,11 +907,8 @@ define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, <2 x float>* %addr) { ; SSE2-LABEL: load_v2f32_v2i32_undef: ; SSE2: ## %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: ## implicit-def: $xmm0 @@ -925,7 +916,7 @@ ; SSE2-NEXT: ## %bb.1: ## %cond.load ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: LBB8_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm1, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB8_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 @@ -938,10 +929,8 @@ ; ; SSE42-LABEL: load_v2f32_v2i32_undef: ; SSE42: ## %bb.0: -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE42-NEXT: pextrb $0, %xmm1, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: ## implicit-def: $xmm0 @@ -949,7 +938,7 @@ ; SSE42-NEXT: ## %bb.1: ## %cond.load ; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE42-NEXT: LBB8_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm1, %eax +; SSE42-NEXT: pextrb $4, %xmm1, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB8_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 @@ -957,29 +946,18 @@ ; SSE42-NEXT: LBB8_4: ## %else2 ; SSE42-NEXT: retq ; -; AVX1-LABEL: load_v2f32_v2i32_undef: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_v2f32_v2i32_undef: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX1OR2-LABEL: load_v2f32_v2i32_undef: +; AVX1OR2: ## %bb.0: +; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 +; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: load_v2f32_v2i32_undef: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw 
$14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} @@ -987,13 +965,21 @@ ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: load_v2f32_v2i32_undef: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} -; AVX512VL-NEXT: retq +; AVX512VLDQ-LABEL: load_v2f32_v2i32_undef: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: load_v2f32_v2i32_undef: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} +; AVX512VLBW-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float>undef) ret <2 x float> %res @@ -1830,38 +1816,40 @@ ; ; AVX1-LABEL: load_v8i64_v8i16: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpmovsxdq %xmm4, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm3 -; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm1, %ymm0 -; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm4, %ymm1 -; AVX1-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 +; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 +; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_v8i64_v8i16: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpcmpeqw %xmm4, 
%xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm3 -; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm1, %ymm0 -; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm4, %ymm1 -; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4 +; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 +; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm3, %ymm1 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_v8i64_v8i16: @@ -2082,48 +2070,42 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { ; SSE2-LABEL: load_v2i32_v2i32: ; SSE2: ## %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB17_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: LBB17_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB17_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movl 4(%rdi), %eax -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: LBB17_4: ## %else2 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v2i32_v2i32: ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: pcmpeqq %xmm2, %xmm0 -; SSE42-NEXT: pextrb $0, %xmm0, %eax +; SSE42-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE42-NEXT: pextrb $0, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB17_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movl (%rdi), %eax -; SSE42-NEXT: pinsrq $0, %rax, %xmm1 +; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1 ; SSE42-NEXT: LBB17_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm0, %eax +; SSE42-NEXT: pextrb $4, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB17_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movl 4(%rdi), %eax -; SSE42-NEXT: pinsrq $1, %rax, %xmm1 +; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1 ; SSE42-NEXT: LBB17_4: ## %else2 ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: retq @@ -2131,49 +2113,48 @@ ; AVX1-LABEL: load_v2i32_v2i32: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_v2i32_v2i32: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd 
{{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_v2i32_v2i32: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: load_v2i32_v2i32: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: retq +; AVX512VLDQ-LABEL: load_v2i32_v2i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: load_v2i32_v2i32: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLBW-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} +; AVX512VLBW-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) ret <2 x i32> %res Index: llvm/test/CodeGen/X86/masked_store.ll =================================================================== --- llvm/test/CodeGen/X86/masked_store.ll +++ llvm/test/CodeGen/X86/masked_store.ll @@ -234,18 +234,15 @@ define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { ; SSE2-LABEL: store_v2f32_v2i32: ; SSE2: ## %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB3_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store ; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: LBB3_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB3_4 ; SSE2-NEXT: ## %bb.3: ## %cond.store1 @@ -257,15 +254,14 @@ ; SSE4-LABEL: store_v2f32_v2i32: ; SSE4: ## %bb.0: ; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; 
SSE4-NEXT: pcmpeqq %xmm2, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax +; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB3_2 ; SSE4-NEXT: ## %bb.1: ## %cond.store ; SSE4-NEXT: movss %xmm1, (%rdi) ; SSE4-NEXT: LBB3_2: ## %else -; SSE4-NEXT: pextrb $8, %xmm0, %eax +; SSE4-NEXT: pextrb $4, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB3_4 ; SSE4-NEXT: ## %bb.3: ## %cond.store1 @@ -273,43 +269,40 @@ ; SSE4-NEXT: LBB3_4: ## %else2 ; SSE4-NEXT: retq ; -; AVX1-LABEL: store_v2f32_v2i32: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_v2f32_v2i32: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX1OR2-LABEL: store_v2f32_v2i32: +; AVX1OR2: ## %bb.0: +; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1OR2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: store_v2f32_v2i32: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: store_v2f32_v2i32: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovups %xmm1, (%rdi) {%k1} -; AVX512VL-NEXT: retq +; AVX512VLDQ-LABEL: store_v2f32_v2i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: vmovups %xmm1, (%rdi) {%k1} +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: store_v2f32_v2i32: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLBW-NEXT: vmovups %xmm1, (%rdi) {%k1} +; AVX512VLBW-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) ret void @@ -1127,22 +1120,19 @@ define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { ; SSE2-LABEL: store_v2i32_v2i32: ; SSE2: ## %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB10_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store ; SSE2-NEXT: movd 
%xmm1, (%rdi) ; SSE2-NEXT: LBB10_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB10_4 ; SSE2-NEXT: ## %bb.3: ## %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: movd %xmm0, 4(%rdi) ; SSE2-NEXT: LBB10_4: ## %else2 ; SSE2-NEXT: retq @@ -1150,61 +1140,63 @@ ; SSE4-LABEL: store_v2i32_v2i32: ; SSE4: ## %bb.0: ; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE4-NEXT: pcmpeqq %xmm2, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax +; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB10_2 ; SSE4-NEXT: ## %bb.1: ## %cond.store ; SSE4-NEXT: movss %xmm1, (%rdi) ; SSE4-NEXT: LBB10_2: ## %else -; SSE4-NEXT: pextrb $8, %xmm0, %eax +; SSE4-NEXT: pextrb $4, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB10_4 ; SSE4-NEXT: ## %bb.3: ## %cond.store1 -; SSE4-NEXT: extractps $2, %xmm1, 4(%rdi) +; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi) ; SSE4-NEXT: LBB10_4: ## %else2 ; SSE4-NEXT: retq ; ; AVX1-LABEL: store_v2i32_v2i32: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: store_v2i32_v2i32: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: store_v2i32_v2i32: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: store_v2i32_v2i32: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vpmovqd %xmm1, (%rdi) {%k1} -; AVX512VL-NEXT: retq +; AVX512VLDQ-LABEL: store_v2i32_v2i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: store_v2i32_v2i32: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $14, 
%k0, %k1 +; AVX512VLBW-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} +; AVX512VLBW-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) ret void Index: llvm/test/CodeGen/X86/masked_store_trunc.ll =================================================================== --- llvm/test/CodeGen/X86/masked_store_trunc.ll +++ llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -651,17 +651,15 @@ ; SSE2-NEXT: pxor %xmm7, %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm7 ; SSE2-NEXT: packssdw %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm7, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %eax @@ -675,25 +673,25 @@ ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB2_4: # %else2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB2_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB2_8: # %else6 ; SSE2-NEXT: pxor %xmm2, %xmm2 @@ -702,17 +700,16 @@ ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pextrw $0, %xmm1, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: je .LBB2_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movb %al, 4(%rdi) ; SSE2-NEXT: .LBB2_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: movb %ah, 5(%rdi) ; SSE2-NEXT: .LBB2_12: # %else10 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 @@ -720,34 +717,34 @@ ; SSE2-NEXT: pxor %xmm5, %xmm1 ; SSE2-NEXT: pextrw $4, %xmm1, %eax ; 
SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: je .LBB2_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movb %al, 6(%rdi) ; SSE2-NEXT: .LBB2_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: movb %ah, 7(%rdi) ; SSE2-NEXT: .LBB2_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i64_v8i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movdqa %xmm4, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE4-NEXT: pxor %xmm7, %xmm7 +; SSE4-NEXT: pcmpeqd %xmm4, %xmm7 ; SSE4-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE4-NEXT: pxor %xmm7, %xmm6 -; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2,3],xmm3[4],xmm8[5,6,7] -; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2,3],xmm2[4],xmm8[5,6,7] +; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE4-NEXT: pand %xmm7, %xmm3 +; SSE4-NEXT: pand %xmm7, %xmm2 ; SSE4-NEXT: packusdw %xmm3, %xmm2 -; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1,2,3],xmm1[4],xmm8[5,6,7] -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1,2,3],xmm0[4],xmm8[5,6,7] +; SSE4-NEXT: pand %xmm7, %xmm1 +; SSE4-NEXT: pand %xmm7, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 ; SSE4-NEXT: packusdw %xmm2, %xmm0 +; SSE4-NEXT: packuswb %xmm0, %xmm0 ; SSE4-NEXT: pextrb $0, %xmm6, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_2 @@ -758,7 +755,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB2_4: # %else2 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm4 @@ -768,13 +765,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB2_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB2_8: # %else6 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm5, %xmm2 @@ -784,13 +781,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_10 ; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB2_10: # %else8 ; SSE4-NEXT: pextrb $4, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_12 ; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) +; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) ; SSE4-NEXT: .LBB2_12: # %else10 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm5 @@ -800,13 +797,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_14 ; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB2_14: # %else12 ; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_16 ; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) +; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) ; SSE4-NEXT: .LBB2_16: # %else14 ; SSE4-NEXT: retq ; @@ -816,7 +813,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpxor 
%xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 @@ -824,6 +821,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX1-NEXT: vpackusdw %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm5, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_2 @@ -836,7 +834,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX1-NEXT: .LBB2_4: # %else2 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 @@ -846,7 +844,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB2_6: # %else4 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 @@ -854,7 +852,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX1-NEXT: .LBB2_8: # %else6 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -865,13 +863,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_10 ; AVX1-NEXT: # %bb.9: # %cond.store7 -; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX1-NEXT: .LBB2_10: # %else8 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_12 ; AVX1-NEXT: # %bb.11: # %cond.store9 -; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX1-NEXT: .LBB2_12: # %else10 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -881,13 +879,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_14 ; AVX1-NEXT: # %bb.13: # %cond.store11 -; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX1-NEXT: .LBB2_14: # %else12 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_16 ; AVX1-NEXT: # %bb.15: # %cond.store13 -; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX1-NEXT: .LBB2_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -898,13 +896,17 @@ ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm5 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpshufb 
%xmm7, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vpextrb $0, %xmm5, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_2 @@ -917,7 +919,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX2-NEXT: .LBB2_4: # %else2 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 @@ -927,7 +929,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB2_6: # %else4 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 @@ -935,7 +937,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX2-NEXT: .LBB2_8: # %else6 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -946,13 +948,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_10 ; AVX2-NEXT: # %bb.9: # %cond.store7 -; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX2-NEXT: .LBB2_10: # %else8 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_12 ; AVX2-NEXT: # %bb.11: # %cond.store9 -; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX2-NEXT: .LBB2_12: # %else10 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -962,13 +964,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_14 ; AVX2-NEXT: # %bb.13: # %cond.store11 -; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX2-NEXT: .LBB2_14: # %else12 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_16 ; AVX2-NEXT: # %bb.15: # %cond.store13 -; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX2-NEXT: .LBB2_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -977,7 +979,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_2 @@ -990,7 +992,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB2_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -998,7 +1000,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB2_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -1006,7 +1008,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB2_8: # %else6 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $4, %k0, %k0 @@ -1014,7 +1016,7 @@ ; 
AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_10 ; AVX512F-NEXT: # %bb.9: # %cond.store7 -; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB2_10: # %else8 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $5, %k0, %k0 @@ -1022,7 +1024,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_12 ; AVX512F-NEXT: # %bb.11: # %cond.store9 -; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX512F-NEXT: .LBB2_12: # %else10 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $6, %k0, %k0 @@ -1030,7 +1032,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_14 ; AVX512F-NEXT: # %bb.13: # %cond.store11 -; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB2_14: # %else12 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $7, %k0, %k0 @@ -1038,7 +1040,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_16 ; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX512F-NEXT: .LBB2_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1208,7 +1210,11 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al @@ -1223,7 +1229,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB4_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) ; SSE2-NEXT: .LBB4_4: # %else2 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -1233,7 +1239,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB4_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: .LBB4_6: # %else4 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 @@ -1242,7 +1248,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB4_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) ; SSE2-NEXT: .LBB4_8: # %else6 ; SSE2-NEXT: retq @@ -1253,7 +1259,11 @@ ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE4-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE4-NEXT: pxor %xmm4, %xmm3 -; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE4-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE4-NEXT: pextrb $0, %xmm3, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_2 @@ -1264,9 +1274,9 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB4_4: # %else2 -; SSE4-NEXT: xorps %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE4-NEXT: pxor %xmm2, %xmm1 @@ 
-1274,61 +1284,108 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB4_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB4_8: # %else6 ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v4i64_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; AVX-NEXT: vpextrb $0, %xmm2, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB4_2 -; AVX-NEXT: # %bb.1: # %cond.store -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB4_2: # %else -; AVX-NEXT: vpextrb $4, %xmm2, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB4_4 -; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB4_4: # %else2 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB4_6 -; AVX-NEXT: # %bb.5: # %cond.store3 -; AVX-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX-NEXT: .LBB4_6: # %else4 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB4_8 -; AVX-NEXT: # %bb.7: # %cond.store5 -; AVX-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX-NEXT: .LBB4_8: # %else6 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v4i64_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je .LBB4_2 +; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: .LBB4_2: # %else +; AVX1-NEXT: vpextrb $4, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je .LBB4_4 +; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: .LBB4_4: # %else2 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je .LBB4_6 +; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX1-NEXT: .LBB4_6: # %else4 +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je .LBB4_8 +; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX1-NEXT: .LBB4_8: # %else6 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v4i64_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; 
AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je .LBB4_2 +; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: .LBB4_2: # %else +; AVX2-NEXT: vpextrb $4, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je .LBB4_4 +; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX2-NEXT: .LBB4_4: # %else2 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je .LBB4_6 +; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX2-NEXT: .LBB4_6: # %else4 +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je .LBB4_8 +; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX2-NEXT: .LBB4_8: # %else6 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i64_v4i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_2 @@ -1341,7 +1398,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB4_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -1349,7 +1406,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB4_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -1357,7 +1414,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB4_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1367,10 +1424,9 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512BW-NEXT: kshiftld $28, %k0, %k0 ; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1392,41 +1448,46 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; 
SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: je .LBB5_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: .LBB5_2: # %else -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm3, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: pextrw $2, %xmm3, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB5_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB5_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB5_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB5_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB5_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB5_8: # %else6 ; SSE2-NEXT: retq @@ -1437,7 +1498,10 @@ ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE4-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE4-NEXT: pxor %xmm4, %xmm3 -; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE4-NEXT: pshufb %xmm4, %xmm1 +; SSE4-NEXT: pshufb %xmm4, %xmm0 +; SSE4-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE4-NEXT: pextrb $0, %xmm3, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_2 @@ -1448,9 +1512,9 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB5_4: # %else2 -; SSE4-NEXT: xorps %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE4-NEXT: pxor %xmm2, %xmm1 @@ -1458,61 +1522,106 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB5_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB5_8: # %else6 ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v4i64_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; AVX-NEXT: vpextrb $0, 
%xmm2, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB5_2 -; AVX-NEXT: # %bb.1: # %cond.store -; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB5_2: # %else -; AVX-NEXT: vpextrb $4, %xmm2, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB5_4 -; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX-NEXT: .LBB5_4: # %else2 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB5_6 -; AVX-NEXT: # %bb.5: # %cond.store3 -; AVX-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB5_6: # %else4 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB5_8 -; AVX-NEXT: # %bb.7: # %cond.store5 -; AVX-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX-NEXT: .LBB5_8: # %else6 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v4i64_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je .LBB5_2 +; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: .LBB5_2: # %else +; AVX1-NEXT: vpextrb $4, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je .LBB5_4 +; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX1-NEXT: .LBB5_4: # %else2 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je .LBB5_6 +; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX1-NEXT: .LBB5_6: # %else4 +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je .LBB5_8 +; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX1-NEXT: .LBB5_8: # %else6 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v4i64_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je .LBB5_2 +; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX2-NEXT: .LBB5_2: # %else +; AVX2-NEXT: vpextrb $4, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je .LBB5_4 +; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX2-NEXT: .LBB5_4: # %else2 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm2, 
%xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je .LBB5_6 +; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX2-NEXT: .LBB5_6: # %else4 +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je .LBB5_8 +; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX2-NEXT: .LBB5_8: # %else6 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i64_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_2 @@ -1525,7 +1634,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB5_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -1533,7 +1642,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB5_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -1541,7 +1650,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB5_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1551,10 +1660,9 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1578,6 +1686,7 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al @@ -1591,7 +1700,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB6_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: movd %xmm0, 4(%rdi) ; SSE2-NEXT: .LBB6_4: # %else2 ; SSE2-NEXT: retq @@ -1602,17 +1711,18 @@ ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE4-NEXT: pxor %xmm2, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE4-NEXT: pextrb $0, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB6_2 ; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: movss %xmm0, (%rdi) +; SSE4-NEXT: movd %xmm0, (%rdi) ; SSE4-NEXT: .LBB6_2: # %else ; SSE4-NEXT: pextrb $8, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB6_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: extractps $2, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrd $1, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB6_4: # %else2 ; SSE4-NEXT: retq ; @@ 
-1642,9 +1752,9 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1659,9 +1769,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1678,6 +1788,8 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al @@ -1692,7 +1804,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB7_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) ; SSE2-NEXT: .LBB7_4: # %else2 ; SSE2-NEXT: retq @@ -1703,6 +1815,8 @@ ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE4-NEXT: pxor %xmm2, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE4-NEXT: pextrb $0, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB7_2 @@ -1713,7 +1827,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB7_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrw $4, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB7_4: # %else2 ; SSE4-NEXT: retq ; @@ -1723,6 +1837,8 @@ ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpextrb $0, %xmm1, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB7_2 @@ -1733,7 +1849,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB7_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrw $4, %xmm0, 2(%rdi) +; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX-NEXT: .LBB7_4: # %else2 ; AVX-NEXT: retq ; @@ -1741,6 +1857,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB7_2 @@ -1753,7 +1871,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB7_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrw $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB7_4: # %else2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1762,10 +1880,10 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512BW-NEXT: kshiftld $30, %k0, %k0 ; AVX512BW-NEXT: kshiftrd 
$30, %k0, %k1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1788,22 +1906,25 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: je .LBB8_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: .LBB8_2: # %else -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB8_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB8_4: # %else2 ; SSE2-NEXT: retq ; @@ -1813,6 +1934,7 @@ ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE4-NEXT: pxor %xmm2, %xmm1 +; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pextrb $0, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB8_2 @@ -1823,7 +1945,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB8_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $8, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB8_4: # %else2 ; SSE4-NEXT: retq ; @@ -1833,6 +1955,7 @@ ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpextrb $0, %xmm1, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB8_2 @@ -1843,7 +1966,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB8_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrb $8, %xmm0, 1(%rdi) +; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX-NEXT: .LBB8_4: # %else2 ; AVX-NEXT: retq ; @@ -1851,6 +1974,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB8_2 @@ -1863,7 +1987,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB8_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $8, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB8_4: # %else2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1872,9 +1996,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: kshiftlq $62, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $62, %k0, %k1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3920,11 +4044,11 @@ ; SSE2-NEXT: pxor %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: packssdw %xmm0, %xmm5 -; SSE2-NEXT: pslld $16, %xmm1 
-; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm5, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %eax @@ -3938,25 +4062,25 @@ ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB12_4: # %else2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB12_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB12_8: # %else6 ; SSE2-NEXT: pxor %xmm2, %xmm2 @@ -3965,17 +4089,16 @@ ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pextrw $0, %xmm1, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: je .LBB12_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movb %al, 4(%rdi) ; SSE2-NEXT: .LBB12_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: movb %ah, 5(%rdi) ; SSE2-NEXT: .LBB12_12: # %else10 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 @@ -3983,17 +4106,16 @@ ; SSE2-NEXT: pxor %xmm3, %xmm1 ; SSE2-NEXT: pextrw $4, %xmm1, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: je .LBB12_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movb %al, 6(%rdi) ; SSE2-NEXT: .LBB12_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: movb %ah, 7(%rdi) ; SSE2-NEXT: .LBB12_16: # %else14 ; SSE2-NEXT: retq ; @@ -4003,10 +4125,10 @@ ; SSE4-NEXT: pcmpeqd %xmm2, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE4-NEXT: pxor %xmm5, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE4-NEXT: movdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; SSE4-NEXT: pshufb %xmm5, %xmm1 ; SSE4-NEXT: pshufb %xmm5, %xmm0 -; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE4-NEXT: pextrb $0, %xmm4, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_2 @@ -4017,7 +4139,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_4 ; 
SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB12_4: # %else2 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 @@ -4027,13 +4149,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB12_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB12_8: # %else6 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 @@ -4043,13 +4165,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_10 ; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB12_10: # %else8 ; SSE4-NEXT: pextrb $4, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_12 ; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) +; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) ; SSE4-NEXT: .LBB12_12: # %else10 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 @@ -4059,13 +4181,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_14 ; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB12_14: # %else12 ; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_16 ; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) +; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) ; SSE4-NEXT: .LBB12_16: # %else14 ; SSE4-NEXT: retq ; @@ -4076,10 +4198,10 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; AVX1-NEXT: vpextrb $0, %xmm4, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_2 @@ -4092,7 +4214,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX1-NEXT: .LBB12_4: # %else2 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 @@ -4102,7 +4224,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB12_6: # %else4 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 @@ -4110,7 +4232,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX1-NEXT: .LBB12_8: # %else6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4121,13 +4243,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_10 ; AVX1-NEXT: # %bb.9: # %cond.store7 -; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX1-NEXT: .LBB12_10: # %else8 ; AVX1-NEXT: vpextrb $4, %xmm2, 
%eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_12 ; AVX1-NEXT: # %bb.11: # %cond.store9 -; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX1-NEXT: .LBB12_12: # %else10 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -4137,13 +4259,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_14 ; AVX1-NEXT: # %bb.13: # %cond.store11 -; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX1-NEXT: .LBB12_14: # %else12 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_16 ; AVX1-NEXT: # %bb.15: # %cond.store13 -; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX1-NEXT: .LBB12_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -4154,8 +4276,11 @@ ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; AVX2-NEXT: vpextrb $0, %xmm4, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_2 @@ -4168,7 +4293,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX2-NEXT: .LBB12_4: # %else2 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 @@ -4178,7 +4303,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB12_6: # %else4 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 @@ -4186,7 +4311,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX2-NEXT: .LBB12_8: # %else6 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4197,13 +4322,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_10 ; AVX2-NEXT: # %bb.9: # %cond.store7 -; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX2-NEXT: .LBB12_10: # %else8 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_12 ; AVX2-NEXT: # %bb.11: # %cond.store9 -; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX2-NEXT: .LBB12_12: # %else10 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -4213,13 +4338,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_14 ; AVX2-NEXT: # %bb.13: # %cond.store11 -; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX2-NEXT: .LBB12_14: # %else12 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_16 ; AVX2-NEXT: # %bb.15: # %cond.store13 -; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX2-NEXT: .LBB12_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -4229,7 +4354,7 @@ ; AVX512F-NEXT: # kill: def 
$ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_2 @@ -4242,7 +4367,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB12_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -4250,7 +4375,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB12_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -4258,7 +4383,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB12_8: # %else6 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $4, %k0, %k0 @@ -4266,7 +4391,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_10 ; AVX512F-NEXT: # %bb.9: # %cond.store7 -; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB12_10: # %else8 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $5, %k0, %k0 @@ -4274,7 +4399,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_12 ; AVX512F-NEXT: # %bb.11: # %cond.store9 -; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX512F-NEXT: .LBB12_12: # %else10 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $6, %k0, %k0 @@ -4282,7 +4407,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_14 ; AVX512F-NEXT: # %bb.13: # %cond.store11 -; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB12_14: # %else12 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $7, %k0, %k0 @@ -4290,7 +4415,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_16 ; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX512F-NEXT: .LBB12_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -4300,10 +4425,9 @@ ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4325,6 +4449,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al @@ -4339,7 +4466,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB13_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: 
pextrw $2, %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) ; SSE2-NEXT: .LBB13_4: # %else2 ; SSE2-NEXT: pxor %xmm2, %xmm2 @@ -4349,7 +4476,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB13_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: .LBB13_6: # %else4 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 @@ -4358,7 +4485,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB13_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) ; SSE2-NEXT: .LBB13_8: # %else6 ; SSE2-NEXT: retq @@ -4369,6 +4496,7 @@ ; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm2 +; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_2 @@ -4379,7 +4507,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB13_4: # %else2 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 @@ -4389,13 +4517,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB13_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB13_8: # %else6 ; SSE4-NEXT: retq ; @@ -4405,6 +4533,7 @@ ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 ; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX-NEXT: vpextrb $0, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB13_2 @@ -4415,7 +4544,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB13_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX-NEXT: .LBB13_4: # %else2 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -4425,13 +4554,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB13_6 ; AVX-NEXT: # %bb.5: # %cond.store3 -; AVX-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX-NEXT: .LBB13_6: # %else4 ; AVX-NEXT: vpextrb $12, %xmm1, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB13_8 ; AVX-NEXT: # %bb.7: # %cond.store5 -; AVX-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX-NEXT: .LBB13_8: # %else6 ; AVX-NEXT: retq ; @@ -4439,6 +4568,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_2 @@ -4451,7 +4581,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB13_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -4459,7 +4589,7 @@ ; AVX512F-NEXT: testb $1, %al ; 
AVX512F-NEXT: je .LBB13_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB13_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -4467,7 +4597,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB13_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -4476,9 +4606,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512BW-NEXT: kshiftld $28, %k0, %k0 ; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4499,40 +4629,43 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: je .LBB14_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: .LBB14_2: # %else -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB14_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB14_4: # %else2 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pextrw $4, %xmm3, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB14_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB14_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB14_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB14_8: # %else6 ; SSE2-NEXT: retq @@ -4543,6 +4676,7 @@ ; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm2 +; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_2 @@ -4553,7 +4687,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB14_4: # %else2 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm2, 
%xmm1 @@ -4563,13 +4697,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB14_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB14_8: # %else6 ; SSE4-NEXT: retq ; @@ -4579,6 +4713,7 @@ ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 ; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpextrb $0, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB14_2 @@ -4589,7 +4724,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB14_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX-NEXT: .LBB14_4: # %else2 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -4599,13 +4734,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB14_6 ; AVX-NEXT: # %bb.5: # %cond.store3 -; AVX-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX-NEXT: .LBB14_6: # %else4 ; AVX-NEXT: vpextrb $12, %xmm1, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB14_8 ; AVX-NEXT: # %bb.7: # %cond.store5 -; AVX-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX-NEXT: .LBB14_8: # %else6 ; AVX-NEXT: retq ; @@ -4613,6 +4748,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_2 @@ -4625,7 +4761,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB14_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -4633,7 +4769,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB14_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -4641,7 +4777,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB14_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -4650,9 +4786,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -6988,6 +7124,8 @@ ; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm3, %ecx ; SSE2-NEXT: testb $1, %cl 
; SSE2-NEXT: movd %xmm0, %eax @@ -6999,25 +7137,25 @@ ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB17_4: # %else2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqw %xmm1, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB17_6: # %else4 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB17_8: # %else6 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -7026,17 +7164,16 @@ ; SSE2-NEXT: pxor %xmm3, %xmm2 ; SSE2-NEXT: pextrw $4, %xmm2, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: je .LBB17_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movb %al, 4(%rdi) ; SSE2-NEXT: .LBB17_10: # %else8 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: movb %ah, 5(%rdi) ; SSE2-NEXT: .LBB17_12: # %else10 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw %xmm2, %xmm1 @@ -7044,17 +7181,16 @@ ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: pextrw $6, %xmm2, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: je .LBB17_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movb %al, 6(%rdi) ; SSE2-NEXT: .LBB17_14: # %else12 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $7, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: movb %ah, 7(%rdi) ; SSE2-NEXT: .LBB17_16: # %else14 ; SSE2-NEXT: retq ; @@ -7064,6 +7200,7 @@ ; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm2 +; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_2 @@ -7074,7 +7211,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB17_4: # %else2 ; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 @@ -7084,13 +7221,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB17_6: # %else4 ; SSE4-NEXT: pextrb $6, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB17_8: # 
%else6 ; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 @@ -7100,13 +7237,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_10 ; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB17_10: # %else8 ; SSE4-NEXT: pextrb $10, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_12 ; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) +; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) ; SSE4-NEXT: .LBB17_12: # %else10 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqw %xmm2, %xmm1 @@ -7116,13 +7253,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_14 ; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB17_14: # %else12 ; SSE4-NEXT: pextrb $14, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_16 ; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) +; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) ; SSE4-NEXT: .LBB17_16: # %else14 ; SSE4-NEXT: retq ; @@ -7132,6 +7269,7 @@ ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 ; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpextrb $0, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_2 @@ -7142,7 +7280,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX-NEXT: .LBB17_4: # %else2 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -7152,13 +7290,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_6 ; AVX-NEXT: # %bb.5: # %cond.store3 -; AVX-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX-NEXT: .LBB17_6: # %else4 ; AVX-NEXT: vpextrb $6, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_8 ; AVX-NEXT: # %bb.7: # %cond.store5 -; AVX-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX-NEXT: .LBB17_8: # %else6 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -7168,13 +7306,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_10 ; AVX-NEXT: # %bb.9: # %cond.store7 -; AVX-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX-NEXT: .LBB17_10: # %else8 ; AVX-NEXT: vpextrb $10, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_12 ; AVX-NEXT: # %bb.11: # %cond.store9 -; AVX-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX-NEXT: .LBB17_12: # %else10 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 @@ -7184,13 +7322,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_14 ; AVX-NEXT: # %bb.13: # %cond.store11 -; AVX-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX-NEXT: .LBB17_14: # %else12 ; AVX-NEXT: vpextrb $14, %xmm1, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_16 ; AVX-NEXT: # %bb.15: # %cond.store13 -; AVX-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX-NEXT: .LBB17_16: # %else14 ; AVX-NEXT: retq ; @@ -7202,6 +7340,7 @@ ; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 ; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 ; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: 
testb $1, %al ; AVX512F-NEXT: je .LBB17_2 @@ -7216,7 +7355,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB17_4: # %else2 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -7229,7 +7368,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB17_6: # %else4 ; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 @@ -7239,7 +7378,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB17_8: # %else6 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -7252,7 +7391,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_10 ; AVX512F-NEXT: # %bb.9: # %cond.store7 -; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB17_10: # %else8 ; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 @@ -7262,7 +7401,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_12 ; AVX512F-NEXT: # %bb.11: # %cond.store9 -; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX512F-NEXT: .LBB17_12: # %else10 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 @@ -7275,7 +7414,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_14 ; AVX512F-NEXT: # %bb.13: # %cond.store11 -; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB17_14: # %else12 ; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 @@ -7285,7 +7424,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_16 ; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX512F-NEXT: .LBB17_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -7294,9 +7433,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmw %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll =================================================================== --- llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -990,7 +990,7 @@ ; SSE2-NEXT: packssdw %xmm0, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [127,127] ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm6 ; SSE2-NEXT: pxor %xmm12, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] ; SSE2-NEXT: movdqa %xmm11, %xmm7 @@ -1001,23 +1001,10 @@ ; SSE2-NEXT: pand %xmm13, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm6, 
%xmm14 -; SSE2-NEXT: pand %xmm14, %xmm2 +; SSE2-NEXT: pand %xmm14, %xmm3 ; SSE2-NEXT: pandn %xmm10, %xmm14 -; SSE2-NEXT: por %xmm2, %xmm14 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm12, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm14 +; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm12, %xmm3 ; SSE2-NEXT: movdqa %xmm11, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 @@ -1027,78 +1014,97 @@ ; SSE2-NEXT: pand %xmm13, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pandn %xmm10, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm12, %xmm2 ; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm12, %xmm1 +; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm10, %xmm6 ; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm10, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488] ; SSE2-NEXT: movdqa %xmm6, %xmm0 ; SSE2-NEXT: pxor %xmm12, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm6 -; SSE2-NEXT: pandn %xmm10, %xmm1 -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm13, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm6 ; SSE2-NEXT: pandn %xmm10, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm12, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm14, %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm14 ; SSE2-NEXT: pandn %xmm10, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm14, %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm14 -; SSE2-NEXT: pandn %xmm10, %xmm1 -; SSE2-NEXT: por %xmm14, %xmm1 -; SSE2-NEXT: packssdw %xmm3, %xmm1 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: por %xmm14, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %eax @@ -1112,25 +1118,25 @@ ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB2_4: # %else2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, 
%ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB2_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB2_8: # %else6 ; SSE2-NEXT: pxor %xmm2, %xmm2 @@ -1139,17 +1145,16 @@ ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pextrw $0, %xmm1, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: je .LBB2_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movb %al, 4(%rdi) ; SSE2-NEXT: .LBB2_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: movb %ah, 5(%rdi) ; SSE2-NEXT: .LBB2_12: # %else10 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 @@ -1157,17 +1162,16 @@ ; SSE2-NEXT: pxor %xmm5, %xmm1 ; SSE2-NEXT: pextrw $4, %xmm1, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: je .LBB2_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movb %al, 6(%rdi) ; SSE2-NEXT: .LBB2_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: movb %ah, 7(%rdi) ; SSE2-NEXT: .LBB2_16: # %else14 ; SSE2-NEXT: retq ; @@ -1180,52 +1184,58 @@ ; SSE4-NEXT: pxor %xmm0, %xmm8 ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [127,127] ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm10 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10 -; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm2 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE4-NEXT: movdqa %xmm7, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm10 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: movdqa %xmm7, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE4-NEXT: movdqa %xmm7, %xmm2 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm7 ; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSE4-NEXT: movapd %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm6 ; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm6 -; SSE4-NEXT: movapd %xmm3, %xmm0 +; SSE4-NEXT: movapd %xmm2, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm7 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE4-NEXT: packssdw %xmm6, %xmm7 -; SSE4-NEXT: movapd %xmm2, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm7 +; SSE4-NEXT: movapd %xmm3, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE4-NEXT: movdqa %xmm1, %xmm2 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 ; 
SSE4-NEXT: movapd %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1 -; SSE4-NEXT: packssdw %xmm3, %xmm1 -; SSE4-NEXT: packssdw %xmm1, %xmm7 +; SSE4-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE4-NEXT: andpd %xmm0, %xmm1 +; SSE4-NEXT: andpd %xmm0, %xmm2 +; SSE4-NEXT: packusdw %xmm1, %xmm2 +; SSE4-NEXT: andpd %xmm0, %xmm7 +; SSE4-NEXT: andpd %xmm0, %xmm6 +; SSE4-NEXT: packusdw %xmm7, %xmm6 +; SSE4-NEXT: packusdw %xmm2, %xmm6 +; SSE4-NEXT: packuswb %xmm6, %xmm6 ; SSE4-NEXT: pextrb $0, %xmm8, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_2 ; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: pextrb $0, %xmm7, (%rdi) +; SSE4-NEXT: pextrb $0, %xmm6, (%rdi) ; SSE4-NEXT: .LBB2_2: # %else ; SSE4-NEXT: pextrb $4, %xmm8, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm7, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm6, 1(%rdi) ; SSE4-NEXT: .LBB2_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 +; SSE4-NEXT: xorpd %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE4-NEXT: pxor %xmm4, %xmm0 @@ -1233,13 +1243,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $4, %xmm7, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm6, 2(%rdi) ; SSE4-NEXT: .LBB2_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm0, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $6, %xmm7, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm6, 3(%rdi) ; SSE4-NEXT: .LBB2_8: # %else6 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm5, %xmm1 @@ -1249,13 +1259,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_10 ; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrb $8, %xmm7, 4(%rdi) +; SSE4-NEXT: pextrb $4, %xmm6, 4(%rdi) ; SSE4-NEXT: .LBB2_10: # %else8 ; SSE4-NEXT: pextrb $4, %xmm0, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_12 ; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: pextrb $10, %xmm7, 5(%rdi) +; SSE4-NEXT: pextrb $5, %xmm6, 5(%rdi) ; SSE4-NEXT: .LBB2_12: # %else10 ; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm5 @@ -1265,13 +1275,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_14 ; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: pextrb $12, %xmm7, 6(%rdi) +; SSE4-NEXT: pextrb $6, %xmm6, 6(%rdi) ; SSE4-NEXT: .LBB2_14: # %else12 ; SSE4-NEXT: pextrb $12, %xmm0, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_16 ; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: pextrb $14, %xmm7, 7(%rdi) +; SSE4-NEXT: pextrb $7, %xmm6, 7(%rdi) ; SSE4-NEXT: .LBB2_16: # %else14 ; SSE4-NEXT: retq ; @@ -1280,31 +1290,43 @@ ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 ; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpxor %xmm9, %xmm5, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127] -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm11 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm7, %xmm12 +; AVX1-NEXT: vpxor %xmm9, %xmm5, %xmm12 +; AVX1-NEXT: vmovapd {{.*#+}} ymm13 = [127,127,127,127] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm11 +; AVX1-NEXT: vblendvpd %ymm11, %ymm1, %ymm13, %ymm11 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm3 -; 
AVX1-NEXT: vpcmpgtq %xmm0, %xmm7, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm13 -; AVX1-NEXT: vblendvpd %xmm3, %xmm5, %xmm7, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm14 -; AVX1-NEXT: vblendvpd %xmm12, %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm11, %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm14, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vblendvpd %xmm13, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm10, %eax +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm14 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm14, %ymm15 +; AVX1-NEXT: vblendvpd %ymm15, %ymm0, %ymm13, %ymm13 +; AVX1-NEXT: vmovapd {{.*#+}} ymm15 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vblendvpd %xmm14, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm0, %ymm13, %ymm15, %ymm0 +; AVX1-NEXT: vblendvpd %xmm4, %xmm7, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vblendvpd %xmm10, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm11, %ymm15, %ymm1 +; AVX1-NEXT: vmovapd {{.*#+}} ymm3 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm12, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_2 ; AVX1-NEXT: # %bb.1: # %cond.store @@ -1316,7 +1338,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX1-NEXT: .LBB2_4: # %else2 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 @@ -1326,7 +1348,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB2_6: # %else4 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 @@ -1334,7 +1356,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX1-NEXT: .LBB2_8: # %else6 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -1345,13 +1367,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_10 ; AVX1-NEXT: # %bb.9: # %cond.store7 -; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX1-NEXT: .LBB2_10: # %else8 ; AVX1-NEXT: vpextrb 
$4, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_12 ; AVX1-NEXT: # %bb.11: # %cond.store9 -; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX1-NEXT: .LBB2_12: # %else10 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -1361,13 +1383,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_14 ; AVX1-NEXT: # %bb.13: # %cond.store11 -; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX1-NEXT: .LBB2_14: # %else12 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_16 ; AVX1-NEXT: # %bb.15: # %cond.store13 -; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX1-NEXT: .LBB2_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1379,19 +1401,26 @@ ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [127,127,127,127] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm6, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm6, %ymm7 ; AVX2-NEXT: vblendvpd %ymm7, %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm6, %ymm7 +; AVX2-NEXT: vblendvpd %ymm7, %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm1, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm6, %ymm0, %ymm7 ; AVX2-NEXT: vblendvpd %ymm7, %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm6, %ymm1, %ymm7 +; AVX2-NEXT: vblendvpd %ymm7, %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vpextrb $0, %xmm5, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_2 @@ -1404,7 +1433,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX2-NEXT: .LBB2_4: # %else2 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 @@ -1414,7 +1443,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB2_6: # %else4 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 @@ -1422,7 +1451,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX2-NEXT: .LBB2_8: # %else6 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -1433,13 +1462,13 @@ ; AVX2-NEXT: testb $1, %al ; 
AVX2-NEXT: je .LBB2_10 ; AVX2-NEXT: # %bb.9: # %cond.store7 -; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX2-NEXT: .LBB2_10: # %else8 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_12 ; AVX2-NEXT: # %bb.11: # %cond.store9 -; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX2-NEXT: .LBB2_12: # %else10 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -1449,13 +1478,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_14 ; AVX2-NEXT: # %bb.13: # %cond.store11 -; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX2-NEXT: .LBB2_14: # %else12 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_16 ; AVX2-NEXT: # %bb.15: # %cond.store13 -; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX2-NEXT: .LBB2_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1466,7 +1495,7 @@ ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_2 @@ -1479,7 +1508,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB2_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -1487,7 +1516,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB2_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -1495,7 +1524,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB2_8: # %else6 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $4, %k0, %k0 @@ -1503,7 +1532,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_10 ; AVX512F-NEXT: # %bb.9: # %cond.store7 -; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB2_10: # %else8 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $5, %k0, %k0 @@ -1511,7 +1540,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_12 ; AVX512F-NEXT: # %bb.11: # %cond.store9 -; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX512F-NEXT: .LBB2_12: # %else10 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $6, %k0, %k0 @@ -1519,7 +1548,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_14 ; AVX512F-NEXT: # %bb.13: # %cond.store11 -; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB2_14: # %else12 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $7, %k0, %k0 @@ -1527,7 +1556,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_16 ; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX512F-NEXT: .LBB2_16: # %else14 ; 
AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1810,7 +1839,7 @@ ; SSE2-NEXT: pcmpeqd %xmm2, %xmm10 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147516415,2147516415] ; SSE2-NEXT: movdqa %xmm9, %xmm7 @@ -1821,50 +1850,54 @@ ; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 
= xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movd %xmm10, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al @@ -1879,7 +1912,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB4_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) ; SSE2-NEXT: .LBB4_4: # %else2 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -1889,7 +1922,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB4_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: .LBB4_6: # %else4 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 @@ -1898,7 +1931,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB4_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) ; SSE2-NEXT: .LBB4_8: # %else6 ; SSE2-NEXT: retq @@ -1912,12 +1945,12 @@ ; SSE4-NEXT: pxor %xmm0, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [32767,32767] ; SSE4-NEXT: movdqa %xmm5, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm5, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE4-NEXT: movdqa %xmm5, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5 ; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; SSE4-NEXT: movapd %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 @@ -1926,34 +1959,38 @@ ; SSE4-NEXT: movapd %xmm6, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE4-NEXT: packssdw %xmm3, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE4-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE4-NEXT: pextrb $0, %xmm4, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_2 ; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: pextrw $0, %xmm1, (%rdi) +; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) ; SSE4-NEXT: .LBB4_2: # %else ; SSE4-NEXT: pextrb $4, %xmm4, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrw $2, %xmm1, 2(%rdi) +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB4_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm2, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm2, %xmm1 +; SSE4-NEXT: pextrb $8, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrw $4, %xmm1, 4(%rdi) +; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB4_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax +; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrw $6, %xmm1, 6(%rdi) +; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB4_8: # %else6 ; SSE4-NEXT: retq ; @@ -1973,8 +2010,12 @@ ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm4 ; AVX1-NEXT: 
vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX1-NEXT: vpextrb $0, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB4_2 @@ -1985,7 +2026,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB4_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB4_4: # %else2 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -1995,13 +2036,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB4_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX1-NEXT: .LBB4_6: # %else4 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB4_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX1-NEXT: .LBB4_8: # %else6 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2019,7 +2060,11 @@ ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX2-NEXT: vpextrb $0, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB4_2 @@ -2030,7 +2075,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB4_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB4_4: # %else2 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -2040,13 +2085,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB4_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX2-NEXT: .LBB4_6: # %else4 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB4_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX2-NEXT: .LBB4_8: # %else6 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2060,7 +2105,7 @@ ; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] ; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_2 @@ -2073,7 +2118,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB4_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -2081,7 +2126,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_6 ; 
AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB4_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -2089,7 +2134,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB4_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2099,14 +2144,13 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $28, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32767,32767,32767,32767] ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: kshiftld $28, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2136,7 +2180,7 @@ ; SSE2-NEXT: pcmpeqd %xmm2, %xmm10 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] ; SSE2-NEXT: movdqa %xmm9, %xmm7 @@ -2147,84 +2191,89 @@ ; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm8, 
%xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm10, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: je .LBB5_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: .LBB5_2: # %else -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: pextrw $2, %xmm10, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm10 +; SSE2-NEXT: pextrw $2, %xmm10, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB5_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB5_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB5_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB5_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB5_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB5_8: # %else6 ; SSE2-NEXT: retq @@ -2238,21 +2287,24 @@ ; SSE4-NEXT: pxor %xmm0, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [127,127] ; SSE4-NEXT: movdqa %xmm5, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm5, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE4-NEXT: movdqa %xmm5, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd 
%xmm0, %xmm1, %xmm5 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] ; SSE4-NEXT: movapd %xmm5, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 +; SSE4-NEXT: movdqa %xmm3, %xmm1 +; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE4-NEXT: movapd %xmm6, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE4-NEXT: packssdw %xmm3, %xmm1 +; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE4-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE4-NEXT: pshufb %xmm0, %xmm3 +; SSE4-NEXT: pshufb %xmm0, %xmm1 +; SSE4-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE4-NEXT: pextrb $0, %xmm4, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_2 @@ -2263,7 +2315,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $4, %xmm1, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi) ; SSE4-NEXT: .LBB5_4: # %else2 ; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 @@ -2273,13 +2325,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $8, %xmm1, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm1, 2(%rdi) ; SSE4-NEXT: .LBB5_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm0, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $12, %xmm1, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm1, 3(%rdi) ; SSE4-NEXT: .LBB5_8: # %else6 ; SSE4-NEXT: retq ; @@ -2299,8 +2351,11 @@ ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-NEXT: vpextrb $0, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB5_2 @@ -2311,7 +2366,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB5_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX1-NEXT: .LBB5_4: # %else2 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -2321,13 +2376,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB5_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB5_6: # %else4 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB5_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX1-NEXT: .LBB5_8: # %else6 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2345,7 +2400,10 @@ ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb 
%xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-NEXT: vpextrb $0, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB5_2 @@ -2356,7 +2414,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB5_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX2-NEXT: .LBB5_4: # %else2 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -2366,13 +2424,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB5_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB5_6: # %else4 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB5_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX2-NEXT: .LBB5_8: # %else6 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2386,7 +2444,7 @@ ; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] ; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_2 @@ -2399,7 +2457,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB5_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -2407,7 +2465,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB5_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -2415,7 +2473,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB5_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2425,14 +2483,13 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127] ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2483,13 +2540,14 @@ ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 
= xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al @@ -2503,7 +2561,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB6_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: movd %xmm0, 4(%rdi) ; SSE2-NEXT: .LBB6_4: # %else2 ; SSE2-NEXT: retq @@ -2523,17 +2581,18 @@ ; SSE4-NEXT: movapd %xmm3, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE4-NEXT: pextrb $0, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB6_2 ; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: movss %xmm2, (%rdi) +; SSE4-NEXT: movd %xmm0, (%rdi) ; SSE4-NEXT: .LBB6_2: # %else ; SSE4-NEXT: pextrb $8, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB6_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: extractps $2, %xmm2, 4(%rdi) +; SSE4-NEXT: pextrd $1, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB6_4: # %else2 ; SSE4-NEXT: retq ; @@ -2543,6 +2602,7 @@ ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 @@ -2550,7 +2610,6 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero ; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -2560,6 +2619,7 @@ ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 @@ -2567,7 +2627,6 @@ ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero ; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) ; AVX2-NEXT: retq ; @@ -2576,13 +2635,13 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: kshiftlw $14, %k0, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2600,13 +2659,13 @@ 
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k1 ; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2648,13 +2707,15 @@ ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al @@ -2669,7 +2730,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB7_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) ; SSE2-NEXT: .LBB7_4: # %else2 ; SSE2-NEXT: retq @@ -2689,17 +2750,19 @@ ; SSE4-NEXT: movapd %xmm3, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE4-NEXT: pextrb $0, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB7_2 ; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: pextrw $0, %xmm2, (%rdi) +; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) ; SSE4-NEXT: .LBB7_2: # %else ; SSE4-NEXT: pextrb $8, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB7_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrw $4, %xmm2, 2(%rdi) +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB7_4: # %else2 ; SSE4-NEXT: retq ; @@ -2715,6 +2778,8 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709518848,18446744073709518848] ; AVX-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpextrb $0, %xmm1, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB7_2 @@ -2725,7 +2790,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB7_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrw $4, %xmm0, 2(%rdi) +; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX-NEXT: .LBB7_4: # %else2 ; AVX-NEXT: retq ; @@ -2738,6 +2803,8 @@ ; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709518848,18446744073709518848] ; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: kmovw %k0, %eax ; 
AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB7_2 @@ -2750,7 +2817,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB7_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrw $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB7_4: # %else2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2760,14 +2827,14 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $30, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $30, %k0, %k1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: kshiftld $30, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $30, %k0, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2817,29 +2884,32 @@ ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: je .LBB8_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: .LBB8_2: # %else -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB8_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB8_4: # %else2 ; SSE2-NEXT: retq ; @@ -2858,6 +2928,7 @@ ; SSE4-NEXT: movapd %xmm3, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE4-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pextrb $0, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB8_2 @@ -2868,7 +2939,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB8_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $8, %xmm2, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi) ; SSE4-NEXT: .LBB8_4: # %else2 ; SSE4-NEXT: retq ; @@ -2884,6 +2955,7 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] ; AVX-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpextrb $0, %xmm1, %eax ; AVX-NEXT: 
testb $1, %al ; AVX-NEXT: je .LBB8_2 @@ -2894,7 +2966,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB8_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrb $8, %xmm0, 1(%rdi) +; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX-NEXT: .LBB8_4: # %else2 ; AVX-NEXT: retq ; @@ -2907,6 +2979,7 @@ ; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] ; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB8_2 @@ -2919,7 +2992,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB8_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $8, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB8_4: # %else2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2929,13 +3002,13 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $62, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $62, %k0, %k1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: kshiftlq $62, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $62, %k0, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4967,29 +5040,8 @@ ; SSE2-NEXT: pxor %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: packssdw %xmm0, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [127,127,127,127] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pandn %xmm6, %xmm7 -; SSE2-NEXT: por %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm6, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: packssdw %xmm6, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm5, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %eax @@ -5003,25 +5055,25 @@ ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB12_4: # %else2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB12_6: # %else4 -; SSE2-NEXT: 
pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB12_8: # %else6 ; SSE2-NEXT: pxor %xmm2, %xmm2 @@ -5030,17 +5082,16 @@ ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pextrw $0, %xmm1, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: je .LBB12_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movb %al, 4(%rdi) ; SSE2-NEXT: .LBB12_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: movb %ah, 5(%rdi) ; SSE2-NEXT: .LBB12_12: # %else10 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 @@ -5048,17 +5099,16 @@ ; SSE2-NEXT: pxor %xmm3, %xmm1 ; SSE2-NEXT: pextrw $4, %xmm1, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: je .LBB12_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movb %al, 6(%rdi) ; SSE2-NEXT: .LBB12_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: movb %ah, 7(%rdi) ; SSE2-NEXT: .LBB12_16: # %else14 ; SSE2-NEXT: retq ; @@ -5068,13 +5118,8 @@ ; SSE4-NEXT: pcmpeqd %xmm2, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE4-NEXT: pxor %xmm5, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127] -; SSE4-NEXT: pminsd %xmm5, %xmm0 -; SSE4-NEXT: pminsd %xmm5, %xmm1 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [4294967168,4294967168,4294967168,4294967168] -; SSE4-NEXT: pmaxsd %xmm5, %xmm1 -; SSE4-NEXT: pmaxsd %xmm5, %xmm0 ; SSE4-NEXT: packssdw %xmm1, %xmm0 +; SSE4-NEXT: packsswb %xmm0, %xmm0 ; SSE4-NEXT: pextrb $0, %xmm4, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_2 @@ -5085,7 +5130,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB12_4: # %else2 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 @@ -5095,13 +5140,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB12_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB12_8: # %else6 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 @@ -5111,13 +5156,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_10 ; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB12_10: # %else8 ; SSE4-NEXT: pextrb $4, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_12 ; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) +; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) ; SSE4-NEXT: .LBB12_12: # 
%else10 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 @@ -5127,13 +5172,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_14 ; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB12_14: # %else12 ; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_16 ; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) +; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) ; SSE4-NEXT: .LBB12_16: # %else14 ; SSE4-NEXT: retq ; @@ -5143,14 +5188,9 @@ ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127,127,127] -; AVX1-NEXT: vpminsd %xmm5, %xmm0, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpminsd %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4294967168,4294967168,4294967168,4294967168] -; AVX1-NEXT: vpmaxsd %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsd %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm4, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_2 @@ -5163,7 +5203,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX1-NEXT: .LBB12_4: # %else2 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 @@ -5173,7 +5213,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB12_6: # %else4 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 @@ -5181,7 +5221,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX1-NEXT: .LBB12_8: # %else6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -5192,13 +5232,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_10 ; AVX1-NEXT: # %bb.9: # %cond.store7 -; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX1-NEXT: .LBB12_10: # %else8 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_12 ; AVX1-NEXT: # %bb.11: # %cond.store9 -; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX1-NEXT: .LBB12_12: # %else10 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -5208,13 +5248,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_14 ; AVX1-NEXT: # %bb.13: # %cond.store11 -; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX1-NEXT: .LBB12_14: # %else12 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_16 ; AVX1-NEXT: # %bb.15: # %cond.store13 -; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX1-NEXT: .LBB12_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -5225,12 +5265,9 @@ ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127] -; 
AVX2-NEXT: vpminsd %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX2-NEXT: vpmaxsd %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-NEXT: vpackssdw %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm4, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_2 @@ -5243,7 +5280,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX2-NEXT: .LBB12_4: # %else2 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 @@ -5253,7 +5290,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB12_6: # %else4 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 @@ -5261,7 +5298,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX2-NEXT: .LBB12_8: # %else6 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -5272,13 +5309,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_10 ; AVX2-NEXT: # %bb.9: # %cond.store7 -; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX2-NEXT: .LBB12_10: # %else8 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_12 ; AVX2-NEXT: # %bb.11: # %cond.store9 -; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX2-NEXT: .LBB12_12: # %else10 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -5288,13 +5325,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_14 ; AVX2-NEXT: # %bb.13: # %cond.store11 -; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX2-NEXT: .LBB12_14: # %else12 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_16 ; AVX2-NEXT: # %bb.15: # %cond.store13 -; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX2-NEXT: .LBB12_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -5307,7 +5344,7 @@ ; AVX512F-NEXT: vpminsd %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] ; AVX512F-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_2 @@ -5320,7 +5357,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB12_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -5328,7 +5365,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB12_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -5336,7 +5373,7 @@ ; AVX512F-NEXT: testb $1, %al ; 
AVX512F-NEXT: je .LBB12_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB12_8: # %else6 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $4, %k0, %k0 @@ -5344,7 +5381,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_10 ; AVX512F-NEXT: # %bb.9: # %cond.store7 -; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB12_10: # %else8 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $5, %k0, %k0 @@ -5352,7 +5389,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_12 ; AVX512F-NEXT: # %bb.11: # %cond.store9 -; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX512F-NEXT: .LBB12_12: # %else10 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $6, %k0, %k0 @@ -5360,7 +5397,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_14 ; AVX512F-NEXT: # %bb.13: # %cond.store11 -; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB12_14: # %else12 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $7, %k0, %k0 @@ -5368,7 +5405,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_16 ; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX512F-NEXT: .LBB12_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5377,14 +5414,13 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] ; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] ; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -5412,18 +5448,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767,32767,32767] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294934528,4294934528,4294934528,4294934528] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: packssdw %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al @@ -5438,7 +5463,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB13_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) ; SSE2-NEXT: .LBB13_4: # %else2 ; SSE2-NEXT: pxor %xmm2, %xmm2 @@ -5448,7 +5473,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB13_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; 
SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: .LBB13_6: # %else4 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 @@ -5457,7 +5482,7 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB13_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) ; SSE2-NEXT: .LBB13_8: # %else6 ; SSE2-NEXT: retq @@ -5468,8 +5493,7 @@ ; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pminsd {{.*}}(%rip), %xmm0 -; SSE4-NEXT: pmaxsd {{.*}}(%rip), %xmm0 +; SSE4-NEXT: packssdw %xmm0, %xmm0 ; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_2 @@ -5480,7 +5504,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB13_4: # %else2 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 @@ -5490,93 +5514,52 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB13_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB13_8: # %else6 ; SSE4-NEXT: retq ; -; AVX1-LABEL: truncstore_v4i32_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminsd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB13_2 -; AVX1-NEXT: # %bb.1: # %cond.store -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB13_2: # %else -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB13_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB13_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB13_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB13_6: # %else4 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB13_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB13_8: # %else6 -; AVX1-NEXT: retq -; -; AVX2-LABEL: truncstore_v4i32_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32767,32767,32767,32767] -; AVX2-NEXT: vpminsd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4294934528,4294934528,4294934528,4294934528] -; AVX2-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB13_2 -; AVX2-NEXT: # %bb.1: # %cond.store -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB13_2: # %else -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB13_4 -; AVX2-NEXT: 
# %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB13_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB13_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB13_6: # %else4 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB13_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB13_8: # %else6 -; AVX2-NEXT: retq +; AVX-LABEL: truncstore_v4i32_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je .LBB13_2 +; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX-NEXT: .LBB13_2: # %else +; AVX-NEXT: vpextrb $4, %xmm2, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je .LBB13_4 +; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX-NEXT: .LBB13_4: # %else2 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $8, %xmm1, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je .LBB13_6 +; AVX-NEXT: # %bb.5: # %cond.store3 +; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX-NEXT: .LBB13_6: # %else4 +; AVX-NEXT: vpextrb $12, %xmm1, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je .LBB13_8 +; AVX-NEXT: # %bb.7: # %cond.store5 +; AVX-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX-NEXT: .LBB13_8: # %else6 +; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i32_v4i16: ; AVX512F: # %bb.0: @@ -5586,6 +5569,7 @@ ; AVX512F-NEXT: vpminsd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294934528,4294934528,4294934528,4294934528] ; AVX512F-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_2 @@ -5598,7 +5582,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB13_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -5606,7 +5590,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB13_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -5614,7 +5598,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB13_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5623,13 +5607,13 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $28, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = 
[32767,32767,32767,32767] ; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] ; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: kshiftld $28, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -5662,46 +5646,49 @@ ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967168,4294967168,4294967168,4294967168] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168] +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: je .LBB14_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: .LBB14_2: # %else -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB14_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB14_4: # %else2 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pextrw $4, %xmm3, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB14_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB14_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB14_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB14_8: # %else6 ; SSE2-NEXT: retq @@ -5714,6 +5701,7 @@ ; SSE4-NEXT: pxor %xmm3, %xmm2 ; SSE4-NEXT: pminsd {{.*}}(%rip), %xmm0 ; SSE4-NEXT: pmaxsd {{.*}}(%rip), %xmm0 +; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_2 @@ -5724,7 +5712,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB14_4: # %else2 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 @@ -5734,13 +5722,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_6 ; SSE4-NEXT: # %bb.5: # 
%cond.store3 -; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB14_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB14_8: # %else6 ; SSE4-NEXT: retq ; @@ -5752,6 +5740,7 @@ ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpextrb $0, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB14_2 @@ -5762,7 +5751,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB14_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX1-NEXT: .LBB14_4: # %else2 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -5772,13 +5761,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB14_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB14_6: # %else4 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB14_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX1-NEXT: .LBB14_8: # %else6 ; AVX1-NEXT: retq ; @@ -5792,6 +5781,7 @@ ; AVX2-NEXT: vpminsd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4294967168,4294967168,4294967168,4294967168] ; AVX2-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpextrb $0, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB14_2 @@ -5802,7 +5792,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB14_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX2-NEXT: .LBB14_4: # %else2 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -5812,13 +5802,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB14_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB14_6: # %else4 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB14_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX2-NEXT: .LBB14_8: # %else6 ; AVX2-NEXT: retq ; @@ -5830,6 +5820,7 @@ ; AVX512F-NEXT: vpminsd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168] ; AVX512F-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_2 @@ -5842,7 +5833,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB14_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -5850,7 +5841,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 
2(%rdi) ; AVX512F-NEXT: .LBB14_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -5858,7 +5849,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB14_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5867,13 +5858,13 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] ; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] ; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -8212,8 +8203,7 @@ ; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm3, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax @@ -8225,25 +8215,25 @@ ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB17_4: # %else2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqw %xmm1, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB17_6: # %else4 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB17_8: # %else6 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -8252,17 +8242,16 @@ ; SSE2-NEXT: pxor %xmm3, %xmm2 ; SSE2-NEXT: pextrw $4, %xmm2, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: je .LBB17_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movb %al, 4(%rdi) ; SSE2-NEXT: .LBB17_10: # %else8 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: movb %ah, 5(%rdi) ; SSE2-NEXT: .LBB17_12: # %else10 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw %xmm2, %xmm1 @@ -8270,17 +8259,16 @@ ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: pextrw $6, %xmm2, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: je .LBB17_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax ; 
SSE2-NEXT: movb %al, 6(%rdi) ; SSE2-NEXT: .LBB17_14: # %else12 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $7, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: movb %ah, 7(%rdi) ; SSE2-NEXT: .LBB17_16: # %else14 ; SSE2-NEXT: retq ; @@ -8290,8 +8278,7 @@ ; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pminsw {{.*}}(%rip), %xmm0 -; SSE4-NEXT: pmaxsw {{.*}}(%rip), %xmm0 +; SSE4-NEXT: packsswb %xmm0, %xmm0 ; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_2 @@ -8302,7 +8289,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB17_4: # %else2 ; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 @@ -8312,13 +8299,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB17_6: # %else4 ; SSE4-NEXT: pextrb $6, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB17_8: # %else6 ; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 @@ -8328,13 +8315,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_10 ; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB17_10: # %else8 ; SSE4-NEXT: pextrb $10, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_12 ; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) +; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) ; SSE4-NEXT: .LBB17_12: # %else10 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqw %xmm2, %xmm1 @@ -8344,13 +8331,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_14 ; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB17_14: # %else12 ; SSE4-NEXT: pextrb $14, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_16 ; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) +; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) ; SSE4-NEXT: .LBB17_16: # %else14 ; SSE4-NEXT: retq ; @@ -8360,8 +8347,7 @@ ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 ; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_2 @@ -8372,7 +8358,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX-NEXT: .LBB17_4: # %else2 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -8382,13 +8368,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_6 ; AVX-NEXT: # %bb.5: # %cond.store3 -; AVX-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX-NEXT: .LBB17_6: # %else4 ; AVX-NEXT: vpextrb $6, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je 
.LBB17_8 ; AVX-NEXT: # %bb.7: # %cond.store5 -; AVX-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX-NEXT: .LBB17_8: # %else6 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -8398,13 +8384,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_10 ; AVX-NEXT: # %bb.9: # %cond.store7 -; AVX-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX-NEXT: .LBB17_10: # %else8 ; AVX-NEXT: vpextrb $10, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_12 ; AVX-NEXT: # %bb.11: # %cond.store9 -; AVX-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX-NEXT: .LBB17_12: # %else10 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 @@ -8414,13 +8400,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_14 ; AVX-NEXT: # %bb.13: # %cond.store11 -; AVX-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX-NEXT: .LBB17_14: # %else12 ; AVX-NEXT: vpextrb $14, %xmm1, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_16 ; AVX-NEXT: # %bb.15: # %cond.store13 -; AVX-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX-NEXT: .LBB17_16: # %else14 ; AVX-NEXT: retq ; @@ -8434,6 +8420,7 @@ ; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 ; AVX512F-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_2 @@ -8448,7 +8435,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB17_4: # %else2 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -8461,7 +8448,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB17_6: # %else4 ; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 @@ -8471,7 +8458,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB17_8: # %else6 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -8484,7 +8471,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_10 ; AVX512F-NEXT: # %bb.9: # %cond.store7 -; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB17_10: # %else8 ; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 @@ -8494,7 +8481,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_12 ; AVX512F-NEXT: # %bb.11: # %cond.store9 -; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX512F-NEXT: .LBB17_12: # %else10 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 @@ -8507,7 +8494,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_14 ; AVX512F-NEXT: # %bb.13: # %cond.store11 -; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB17_14: # %else12 ; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: 
vpmovsxwq %xmm1, %zmm1 @@ -8517,7 +8504,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_16 ; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX512F-NEXT: .LBB17_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -8526,11 +8513,11 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmw %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 ; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: llvm/test/CodeGen/X86/masked_store_trunc_usat.ll =================================================================== --- llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -914,6 +914,7 @@ ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm6 +; SSE2-NEXT: packuswb %xmm6, %xmm6 ; SSE2-NEXT: movd %xmm9, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm6, %eax @@ -927,25 +928,25 @@ ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB2_4: # %else2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm6, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB2_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm6, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB2_8: # %else6 ; SSE2-NEXT: pxor %xmm1, %xmm1 @@ -954,17 +955,16 @@ ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm6, %eax ; SSE2-NEXT: je .LBB2_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm6, %eax ; SSE2-NEXT: movb %al, 4(%rdi) ; SSE2-NEXT: .LBB2_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm6, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: movb %ah, 5(%rdi) ; SSE2-NEXT: .LBB2_12: # %else10 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm5 @@ -972,17 +972,16 @@ ; SSE2-NEXT: pxor %xmm5, %xmm0 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm6, %eax ; SSE2-NEXT: je .LBB2_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm6, %eax ; SSE2-NEXT: movb %al, 6(%rdi) ; SSE2-NEXT: .LBB2_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: 
pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB2_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm6, %eax -; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: movb %ah, 7(%rdi) ; SSE2-NEXT: .LBB2_16: # %else14 ; SSE2-NEXT: retq ; @@ -1021,6 +1020,7 @@ ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: packusdw %xmm7, %xmm6 ; SSE4-NEXT: packusdw %xmm6, %xmm1 +; SSE4-NEXT: packuswb %xmm1, %xmm1 ; SSE4-NEXT: pextrb $0, %xmm9, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_2 @@ -1031,7 +1031,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm1, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi) ; SSE4-NEXT: .LBB2_4: # %else2 ; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm4 @@ -1041,13 +1041,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $4, %xmm1, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm1, 2(%rdi) ; SSE4-NEXT: .LBB2_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm0, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $6, %xmm1, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm1, 3(%rdi) ; SSE4-NEXT: .LBB2_8: # %else6 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm5, %xmm2 @@ -1057,13 +1057,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_10 ; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrb $8, %xmm1, 4(%rdi) +; SSE4-NEXT: pextrb $4, %xmm1, 4(%rdi) ; SSE4-NEXT: .LBB2_10: # %else8 ; SSE4-NEXT: pextrb $4, %xmm0, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_12 ; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: pextrb $10, %xmm1, 5(%rdi) +; SSE4-NEXT: pextrb $5, %xmm1, 5(%rdi) ; SSE4-NEXT: .LBB2_12: # %else10 ; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm5 @@ -1073,13 +1073,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_14 ; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: pextrb $12, %xmm1, 6(%rdi) +; SSE4-NEXT: pextrb $6, %xmm1, 6(%rdi) ; SSE4-NEXT: .LBB2_14: # %else12 ; SSE4-NEXT: pextrb $12, %xmm0, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB2_16 ; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: pextrb $14, %xmm1, 7(%rdi) +; SSE4-NEXT: pextrb $7, %xmm1, 7(%rdi) ; SSE4-NEXT: .LBB2_16: # %else14 ; SSE4-NEXT: retq ; @@ -1109,6 +1109,7 @@ ; AVX1-NEXT: vblendvpd %xmm11, %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm10, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_2 @@ -1121,7 +1122,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX1-NEXT: .LBB2_4: # %else2 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 @@ -1131,7 +1132,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB2_6: # %else4 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 @@ -1139,7 +1140,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX1-NEXT: .LBB2_8: # %else6 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-NEXT: vpxor %xmm2, 
%xmm2, %xmm2 @@ -1150,13 +1151,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_10 ; AVX1-NEXT: # %bb.9: # %cond.store7 -; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX1-NEXT: .LBB2_10: # %else8 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_12 ; AVX1-NEXT: # %bb.11: # %cond.store9 -; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX1-NEXT: .LBB2_12: # %else10 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -1166,13 +1167,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_14 ; AVX1-NEXT: # %bb.13: # %cond.store11 -; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX1-NEXT: .LBB2_14: # %else12 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB2_16 ; AVX1-NEXT: # %bb.15: # %cond.store13 -; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX1-NEXT: .LBB2_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1185,17 +1186,24 @@ ; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [255,255,255,255] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm7, %ymm1, %ymm8 +; AVX2-NEXT: vpxor %ymm7, %ymm0, %ymm8 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] ; AVX2-NEXT: vpcmpgtq %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vblendvpd %ymm8, %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpxor %ymm7, %ymm0, %ymm7 +; AVX2-NEXT: vblendvpd %ymm8, %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpxor %ymm7, %ymm1, %ymm7 ; AVX2-NEXT: vpcmpgtq %ymm7, %ymm9, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vblendvpd %ymm7, %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vpextrb $0, %xmm5, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_2 @@ -1208,7 +1216,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX2-NEXT: .LBB2_4: # %else2 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 @@ -1218,7 +1226,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB2_6: # %else4 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 @@ -1226,7 +1234,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb 
$6, %xmm0, 3(%rdi) +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX2-NEXT: .LBB2_8: # %else6 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -1237,13 +1245,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_10 ; AVX2-NEXT: # %bb.9: # %cond.store7 -; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX2-NEXT: .LBB2_10: # %else8 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_12 ; AVX2-NEXT: # %bb.11: # %cond.store9 -; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX2-NEXT: .LBB2_12: # %else10 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -1253,13 +1261,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_14 ; AVX2-NEXT: # %bb.13: # %cond.store11 -; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX2-NEXT: .LBB2_14: # %else12 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB2_16 ; AVX2-NEXT: # %bb.15: # %cond.store13 -; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX2-NEXT: .LBB2_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1269,7 +1277,7 @@ ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_2 @@ -1282,7 +1290,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB2_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -1290,7 +1298,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB2_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -1298,7 +1306,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB2_8: # %else6 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $4, %k0, %k0 @@ -1306,7 +1314,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_10 ; AVX512F-NEXT: # %bb.9: # %cond.store7 -; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB2_10: # %else8 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $5, %k0, %k0 @@ -1314,7 +1322,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_12 ; AVX512F-NEXT: # %bb.11: # %cond.store9 -; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX512F-NEXT: .LBB2_12: # %else10 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $6, %k0, %k0 @@ -1322,7 +1330,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_14 ; AVX512F-NEXT: # %bb.13: # %cond.store11 -; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB2_14: # %else12 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $7, %k0, %k0 @@ -1330,7 +1338,7 @@ ; 
AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB2_16 ; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX512F-NEXT: .LBB2_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1570,7 +1578,7 @@ ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pxor %xmm5, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991] ; SSE2-NEXT: movdqa %xmm9, %xmm7 @@ -1581,56 +1589,60 @@ ; SSE2-NEXT: pand %xmm4, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB4_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) ; SSE2-NEXT: .LBB4_2: # %else -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: pextrw $2, %xmm3, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB4_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm1, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) ; SSE2-NEXT: .LBB4_4: # %else2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pextrw $4, %xmm1, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB4_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: .LBB4_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: pextrw $6, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB4_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm1, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) ; SSE2-NEXT: .LBB4_8: # 
%else6 ; SSE2-NEXT: retq @@ -1640,49 +1652,53 @@ ; SSE4-NEXT: movdqa %xmm0, %xmm8 ; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE4-NEXT: pxor %xmm0, %xmm6 -; SSE4-NEXT: movapd {{.*#+}} xmm5 = [65535,65535] +; SSE4-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE4-NEXT: pxor %xmm0, %xmm5 +; SSE4-NEXT: movapd {{.*#+}} xmm6 = [65535,65535] ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm1, %xmm3 +; SSE4-NEXT: movdqa %xmm8, %xmm3 ; SSE4-NEXT: pxor %xmm7, %xmm3 ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] ; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movapd %xmm5, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE4-NEXT: pxor %xmm8, %xmm7 +; SSE4-NEXT: movapd %xmm6, %xmm3 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE4-NEXT: pxor %xmm1, %xmm7 ; SSE4-NEXT: pcmpgtq %xmm7, %xmm4 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE4-NEXT: packusdw %xmm3, %xmm5 -; SSE4-NEXT: pextrb $0, %xmm6, %eax +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] +; SSE4-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE4-NEXT: pextrb $0, %xmm5, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_2 ; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: pextrw $0, %xmm5, (%rdi) +; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) ; SSE4-NEXT: .LBB4_2: # %else -; SSE4-NEXT: pextrb $4, %xmm6, %eax +; SSE4-NEXT: pextrb $4, %xmm5, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrw $2, %xmm5, 2(%rdi) +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB4_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm2, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm2, %xmm1 +; SSE4-NEXT: pextrb $8, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrw $4, %xmm5, 4(%rdi) +; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB4_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax +; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB4_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrw $6, %xmm5, 6(%rdi) +; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB4_8: # %else6 ; SSE4-NEXT: retq ; @@ -1701,8 +1717,12 @@ ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535] ; AVX1-NEXT: vblendvpd %xmm3, %xmm6, %xmm5, %xmm3 +; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX1-NEXT: vpextrb $0, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB4_2 @@ -1713,7 +1733,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB4_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; 
AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB4_4: # %else2 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -1723,13 +1743,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB4_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX1-NEXT: .LBB4_6: # %else4 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB4_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX1-NEXT: .LBB4_8: # %else6 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1747,7 +1767,11 @@ ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX2-NEXT: vpextrb $0, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB4_2 @@ -1758,7 +1782,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB4_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB4_4: # %else2 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -1768,13 +1792,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB4_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX2-NEXT: .LBB4_6: # %else4 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB4_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX2-NEXT: .LBB4_8: # %else6 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1786,7 +1810,7 @@ ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_2 @@ -1799,7 +1823,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB4_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -1807,7 +1831,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB4_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -1815,7 +1839,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB4_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB4_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1825,12 +1849,11 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vptestmd %zmm1, 
%zmm1, %k0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: kshiftld $28, %k0, %k0 ; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1856,68 +1879,72 @@ ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] ; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: packuswb %xmm0, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm5, %eax ; SSE2-NEXT: je .LBB5_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: .LBB5_2: # %else ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm3, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm3, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB5_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB5_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 
+; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB5_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB5_6: # %else4 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB5_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm1, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB5_8: # %else6 ; SSE2-NEXT: retq @@ -1929,20 +1956,23 @@ ; SSE4-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE4-NEXT: pxor %xmm0, %xmm6 -; SSE4-NEXT: movapd {{.*#+}} xmm5 = [255,255] -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: pxor %xmm7, %xmm3 +; SSE4-NEXT: movapd {{.*#+}} xmm7 = [255,255] +; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm8, %xmm5 +; SSE4-NEXT: pxor %xmm3, %xmm5 ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movapd %xmm5, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE4-NEXT: pxor %xmm8, %xmm7 -; SSE4-NEXT: pcmpgtq %xmm7, %xmm4 -; SSE4-NEXT: movdqa %xmm4, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm5, %xmm0 +; SSE4-NEXT: movapd %xmm7, %xmm5 ; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE4-NEXT: packusdw %xmm3, %xmm5 +; SSE4-NEXT: pxor %xmm1, %xmm3 +; SSE4-NEXT: pcmpgtq %xmm3, %xmm4 +; SSE4-NEXT: movdqa %xmm4, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE4-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE4-NEXT: pshufb %xmm0, %xmm7 +; SSE4-NEXT: pshufb %xmm0, %xmm5 +; SSE4-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; SSE4-NEXT: pextrb $0, %xmm6, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_2 @@ -1953,7 +1983,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $4, %xmm5, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm5, 1(%rdi) ; SSE4-NEXT: .LBB5_4: # %else2 ; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 @@ -1963,13 +1993,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $8, %xmm5, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm5, 2(%rdi) ; SSE4-NEXT: .LBB5_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm0, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB5_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $12, %xmm5, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm5, 3(%rdi) ; SSE4-NEXT: .LBB5_8: # %else6 ; SSE4-NEXT: retq ; @@ -1988,8 +2018,11 @@ ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [255,255] ; AVX1-NEXT: vblendvpd %xmm3, %xmm6, %xmm5, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-NEXT: vpextrb $0, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB5_2 @@ -2000,7 +2033,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB5_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX1-NEXT: .LBB5_4: # %else2 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -2010,13 +2043,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB5_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB5_6: # %else4 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB5_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX1-NEXT: .LBB5_8: # %else6 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2034,7 +2067,10 @@ ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-NEXT: vpextrb $0, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB5_2 @@ -2045,7 +2081,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB5_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX2-NEXT: .LBB5_4: # %else2 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -2055,13 +2091,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB5_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB5_6: # %else4 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB5_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX2-NEXT: .LBB5_8: # %else6 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2073,7 +2109,7 @@ ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] ; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_2 @@ -2086,7 +2122,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB5_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -2094,7 +2130,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB5_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -2102,7 +2138,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB5_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB5_8: # %else6 ; 
AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2112,12 +2148,11 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2151,27 +2186,28 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB6_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm2, (%rdi) +; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: .LBB6_2: # %else -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pextrw $4, %xmm1, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB6_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: movd %xmm0, 4(%rdi) ; SSE2-NEXT: .LBB6_4: # %else2 ; SSE2-NEXT: retq @@ -2181,25 +2217,26 @@ ; SSE4-NEXT: movdqa %xmm0, %xmm2 ; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE4-NEXT: pxor %xmm0, %xmm3 -; SSE4-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295] +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm1 +; SSE4-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295] ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: pxor %xmm2, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372041149743103,9223372041149743103] ; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm3, %eax +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE4-NEXT: pextrb $0, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB6_2 ; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: movss %xmm1, (%rdi) +; SSE4-NEXT: movd %xmm0, (%rdi) ; SSE4-NEXT: .LBB6_2: # %else -; SSE4-NEXT: pextrb $8, %xmm3, %eax +; SSE4-NEXT: pextrb $8, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB6_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: extractps $2, %xmm1, 4(%rdi) +; SSE4-NEXT: pextrd $1, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB6_4: # %else2 ; SSE4-NEXT: retq ; @@ -2209,12 +2246,12 @@ ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; 
AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero ; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295] ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) ; AVX1-NEXT: retq @@ -2225,12 +2262,12 @@ ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero ; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295] ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm3 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) ; AVX2-NEXT: retq @@ -2240,11 +2277,11 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] ; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: kshiftlw $14, %k0, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2261,11 +2298,11 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k1 ; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2291,28 +2328,30 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB7_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) ; SSE2-NEXT: .LBB7_2: # %else -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; 
SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pextrw $4, %xmm1, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB7_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $4, %xmm2, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) ; SSE2-NEXT: .LBB7_4: # %else2 ; SSE2-NEXT: retq @@ -2322,25 +2361,27 @@ ; SSE4-NEXT: movdqa %xmm0, %xmm2 ; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE4-NEXT: pxor %xmm0, %xmm3 -; SSE4-NEXT: movapd {{.*#+}} xmm1 = [65535,65535] +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm1 +; SSE4-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: pxor %xmm2, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854841343,9223372036854841343] ; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm3, %eax +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE4-NEXT: pextrb $0, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB7_2 ; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: pextrw $0, %xmm1, (%rdi) +; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) ; SSE4-NEXT: .LBB7_2: # %else -; SSE4-NEXT: pextrb $8, %xmm3, %eax +; SSE4-NEXT: pextrb $8, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB7_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrw $4, %xmm1, 2(%rdi) +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB7_4: # %else2 ; SSE4-NEXT: retq ; @@ -2355,6 +2396,8 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] ; AVX-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpextrb $0, %xmm1, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB7_2 @@ -2365,7 +2408,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB7_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrw $4, %xmm0, 2(%rdi) +; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX-NEXT: .LBB7_4: # %else2 ; AVX-NEXT: retq ; @@ -2376,6 +2419,8 @@ ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535] ; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB7_2 @@ -2388,7 +2433,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB7_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrw $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB7_4: # %else2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2398,12 +2443,12 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $30, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $30, %k0, %k1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: kshiftld $30, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $30, 
%k0, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2429,36 +2474,40 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: packuswb %xmm4, %xmm4 +; SSE2-NEXT: packuswb %xmm4, %xmm4 +; SSE2-NEXT: packuswb %xmm4, %xmm4 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm4, %eax ; SSE2-NEXT: je .LBB8_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: .LBB8_2: # %else ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB8_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB8_4: # %else2 ; SSE2-NEXT: retq ; @@ -2475,6 +2524,7 @@ ; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854776063,9223372036854776063] ; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE4-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pextrb $0, %xmm3, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB8_2 @@ -2485,7 +2535,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB8_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $8, %xmm1, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi) ; SSE4-NEXT: .LBB8_4: # %else2 ; SSE4-NEXT: retq ; @@ -2500,6 +2550,7 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] ; AVX-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpextrb $0, %xmm1, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB8_2 @@ -2510,7 +2561,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB8_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrb $8, %xmm0, 1(%rdi) +; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX-NEXT: .LBB8_4: # %else2 ; AVX-NEXT: retq ; @@ -2521,6 +2572,7 @@ ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; 
AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255] ; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB8_2 @@ -2533,7 +2585,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB8_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $8, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB8_4: # %else2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2543,11 +2595,11 @@ ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $62, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $62, %k0, %k1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: kshiftlq $62, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $62, %k0, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4673,58 +4725,59 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm4, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm9 ; SSE2-NEXT: packssdw %xmm0, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: pxor %xmm7, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm10, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm8, %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm10 -; SSE2-NEXT: por %xmm0, %xmm10 -; SSE2-NEXT: packuswb %xmm4, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: packuswb %xmm4, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 ; SSE2-NEXT: movd %xmm9, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm10, %eax +; SSE2-NEXT: movd %xmm5, %eax ; SSE2-NEXT: je .LBB12_2 ; SSE2-NEXT: # %bb.1: # %cond.store ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: .LBB12_2: # %else -; SSE2-NEXT: psrlq $16, %xmm5 -; SSE2-NEXT: movd %xmm5, %ecx +; SSE2-NEXT: psrlq $16, %xmm10 +; SSE2-NEXT: movd %xmm10, %ecx ; SSE2-NEXT: shrl $16, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB12_4: # %else2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm10, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; 
SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB12_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm10, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB12_8: # %else6 ; SSE2-NEXT: pxor %xmm1, %xmm1 @@ -4733,17 +4786,16 @@ ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm5, %eax ; SSE2-NEXT: je .LBB12_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm10, %eax ; SSE2-NEXT: movb %al, 4(%rdi) ; SSE2-NEXT: .LBB12_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm10, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: movb %ah, 5(%rdi) ; SSE2-NEXT: .LBB12_12: # %else10 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 @@ -4751,17 +4803,16 @@ ; SSE2-NEXT: pxor %xmm3, %xmm0 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm5, %eax ; SSE2-NEXT: je .LBB12_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm10, %eax ; SSE2-NEXT: movb %al, 6(%rdi) ; SSE2-NEXT: .LBB12_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB12_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm10, %eax -; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: movb %ah, 7(%rdi) ; SSE2-NEXT: .LBB12_16: # %else14 ; SSE2-NEXT: retq ; @@ -4772,9 +4823,12 @@ ; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE4-NEXT: pxor %xmm5, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255] -; SSE4-NEXT: pminud %xmm5, %xmm1 ; SSE4-NEXT: pminud %xmm5, %xmm0 -; SSE4-NEXT: packusdw %xmm1, %xmm0 +; SSE4-NEXT: pminud %xmm5, %xmm1 +; SSE4-NEXT: movdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE4-NEXT: pshufb %xmm5, %xmm1 +; SSE4-NEXT: pshufb %xmm5, %xmm0 +; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE4-NEXT: pextrb $0, %xmm4, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_2 @@ -4785,7 +4839,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB12_4: # %else2 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 @@ -4795,13 +4849,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB12_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB12_8: # %else6 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 @@ -4811,13 +4865,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_10 ; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB12_10: # %else8 ; SSE4-NEXT: pextrb $4, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_12 ; SSE4-NEXT: # %bb.11: # %cond.store9 -; 
SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) +; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) ; SSE4-NEXT: .LBB12_12: # %else10 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 @@ -4827,13 +4881,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_14 ; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB12_14: # %else12 ; SSE4-NEXT: pextrb $12, %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB12_16 ; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) +; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) ; SSE4-NEXT: .LBB12_16: # %else14 ; SSE4-NEXT: retq ; @@ -4843,11 +4897,14 @@ ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255] -; AVX1-NEXT: vpminud %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpminud %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm5, %xmm0, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminud %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; AVX1-NEXT: vpextrb $0, %xmm4, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_2 @@ -4860,7 +4917,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX1-NEXT: .LBB12_4: # %else2 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 @@ -4870,7 +4927,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB12_6: # %else4 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 @@ -4878,7 +4935,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX1-NEXT: .LBB12_8: # %else6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4889,13 +4946,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_10 ; AVX1-NEXT: # %bb.9: # %cond.store7 -; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX1-NEXT: .LBB12_10: # %else8 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_12 ; AVX1-NEXT: # %bb.11: # %cond.store9 -; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX1-NEXT: .LBB12_12: # %else10 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -4905,13 +4962,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_14 ; AVX1-NEXT: # %bb.13: # %cond.store11 -; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX1-NEXT: .LBB12_14: # %else12 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB12_16 ; AVX1-NEXT: # %bb.15: # %cond.store13 -; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX1-NEXT: .LBB12_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ 
-4925,7 +4982,10 @@ ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpminud %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; AVX2-NEXT: vpextrb $0, %xmm4, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_2 @@ -4938,7 +4998,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX2-NEXT: .LBB12_4: # %else2 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 @@ -4948,7 +5008,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB12_6: # %else4 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 @@ -4956,7 +5016,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX2-NEXT: .LBB12_8: # %else6 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4967,13 +5027,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_10 ; AVX2-NEXT: # %bb.9: # %cond.store7 -; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX2-NEXT: .LBB12_10: # %else8 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_12 ; AVX2-NEXT: # %bb.11: # %cond.store9 -; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX2-NEXT: .LBB12_12: # %else10 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -4983,13 +5043,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_14 ; AVX2-NEXT: # %bb.13: # %cond.store11 -; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX2-NEXT: .LBB12_14: # %else12 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB12_16 ; AVX2-NEXT: # %bb.15: # %cond.store13 -; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX2-NEXT: .LBB12_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -5000,7 +5060,7 @@ ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminud %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_2 @@ -5013,7 +5073,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB12_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -5021,7 +5081,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB12_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -5029,7 
+5089,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB12_8: # %else6 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $4, %k0, %k0 @@ -5037,7 +5097,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_10 ; AVX512F-NEXT: # %bb.9: # %cond.store7 -; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB12_10: # %else8 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $5, %k0, %k0 @@ -5045,7 +5105,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_12 ; AVX512F-NEXT: # %bb.11: # %cond.store9 -; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX512F-NEXT: .LBB12_12: # %else10 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $6, %k0, %k0 @@ -5053,7 +5113,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_14 ; AVX512F-NEXT: # %bb.13: # %cond.store11 -; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB12_14: # %else12 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $7, %k0, %k0 @@ -5061,7 +5121,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB12_16 ; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX512F-NEXT: .LBB12_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5070,12 +5130,11 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -5098,49 +5157,52 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, <4 x i16>* %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i32_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al ; 
SSE2-NEXT: je .LBB13_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) ; SSE2-NEXT: .LBB13_2: # %else -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm3, %eax +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB13_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) ; SSE2-NEXT: .LBB13_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pextrw $4, %xmm3, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB13_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm2, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: .LBB13_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pextrw $6, %xmm1, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB13_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm2, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) ; SSE2-NEXT: .LBB13_8: # %else6 ; SSE2-NEXT: retq @@ -5152,6 +5214,7 @@ ; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm2 ; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0 +; SSE4-NEXT: packusdw %xmm0, %xmm0 ; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_2 @@ -5162,7 +5225,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB13_4: # %else2 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 @@ -5172,13 +5235,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB13_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB13_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB13_8: # %else6 ; SSE4-NEXT: retq ; @@ -5189,6 +5252,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB13_2 @@ -5199,7 +5263,7 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB13_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB13_4: # %else2 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -5209,13 +5273,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB13_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX1-NEXT: .LBB13_6: # %else4 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB13_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX1-NEXT: 
vpextrw $3, %xmm0, 6(%rdi) ; AVX1-NEXT: .LBB13_8: # %else6 ; AVX1-NEXT: retq ; @@ -5227,6 +5291,7 @@ ; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [65535,65535,65535,65535] ; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB13_2 @@ -5237,7 +5302,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB13_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB13_4: # %else2 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -5247,13 +5312,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB13_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX2-NEXT: .LBB13_6: # %else4 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB13_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX2-NEXT: .LBB13_8: # %else6 ; AVX2-NEXT: retq ; @@ -5263,6 +5328,7 @@ ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [65535,65535,65535,65535] ; AVX512F-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_2 @@ -5275,7 +5341,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB13_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -5283,7 +5349,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB13_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -5291,7 +5357,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB13_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB13_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5300,11 +5366,11 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $28, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] ; AVX512BW-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: kshiftld $28, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -5326,49 +5392,53 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, <4 x i8>* %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i32_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] -; 
SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm5, %eax ; SSE2-NEXT: je .LBB14_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: .LBB14_2: # %else ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm3, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB14_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB14_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB14_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB14_6: # %else4 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB14_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $6, %xmm2, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB14_8: # %else6 ; SSE2-NEXT: retq @@ -5380,6 +5450,7 @@ ; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm2 ; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0 +; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_2 @@ -5390,7 +5461,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_4 ; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB14_4: # %else2 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 @@ -5400,13 +5471,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB14_6: # %else4 ; SSE4-NEXT: pextrb $12, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB14_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB14_8: # %else6 ; SSE4-NEXT: retq ; @@ -5417,6 +5488,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpextrb $0, %xmm2, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB14_2 @@ -5427,7 +5499,7 @@ ; AVX1-NEXT: testb $1, %al 
; AVX1-NEXT: je .LBB14_4 ; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX1-NEXT: .LBB14_4: # %else2 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -5437,13 +5509,13 @@ ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB14_6 ; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX1-NEXT: .LBB14_6: # %else4 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB14_8 ; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX1-NEXT: .LBB14_8: # %else6 ; AVX1-NEXT: retq ; @@ -5455,6 +5527,7 @@ ; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255] ; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpextrb $0, %xmm2, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB14_2 @@ -5465,7 +5538,7 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB14_4 ; AVX2-NEXT: # %bb.3: # %cond.store1 -; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX2-NEXT: .LBB14_4: # %else2 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 @@ -5475,13 +5548,13 @@ ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB14_6 ; AVX2-NEXT: # %bb.5: # %cond.store3 -; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX2-NEXT: .LBB14_6: # %else4 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB14_8 ; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX2-NEXT: .LBB14_8: # %else6 ; AVX2-NEXT: retq ; @@ -5491,6 +5564,7 @@ ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] ; AVX512F-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_2 @@ -5503,7 +5577,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB14_4: # %else2 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $2, %k0, %k0 @@ -5511,7 +5585,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB14_6: # %else4 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftrw $3, %k0, %k0 @@ -5519,7 +5593,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB14_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB14_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5528,11 +5602,11 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] ; AVX512BW-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -7898,10 +7972,10 @@ ; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm3, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax @@ -7913,25 +7987,25 @@ ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: .LBB17_4: # %else2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqw %xmm1, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) ; SSE2-NEXT: .LBB17_6: # %else4 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: movb %al, 3(%rdi) ; SSE2-NEXT: .LBB17_8: # %else6 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -7940,17 +8014,16 @@ ; SSE2-NEXT: pxor %xmm3, %xmm2 ; SSE2-NEXT: pextrw $4, %xmm2, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $2, %xmm0, %eax ; SSE2-NEXT: je .LBB17_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movb %al, 4(%rdi) ; SSE2-NEXT: .LBB17_10: # %else8 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: movb %ah, 5(%rdi) ; SSE2-NEXT: .LBB17_12: # %else10 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw %xmm2, %xmm1 @@ -7958,17 +8031,16 @@ ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: pextrw $6, %xmm2, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: je .LBB17_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movb %al, 6(%rdi) ; SSE2-NEXT: .LBB17_14: # %else12 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pextrw $7, %xmm2, %ecx +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je .LBB17_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: movb %ah, 7(%rdi) ; SSE2-NEXT: .LBB17_16: # %else14 ; SSE2-NEXT: retq ; @@ -7979,6 +8051,7 @@ ; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm2 ; SSE4-NEXT: pminuw {{.*}}(%rip), %xmm0 +; SSE4-NEXT: packuswb %xmm0, %xmm0 ; SSE4-NEXT: pextrb $0, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_2 @@ -7989,7 +8062,7 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_4 ; 
SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) ; SSE4-NEXT: .LBB17_4: # %else2 ; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 @@ -7999,13 +8072,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_6 ; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) ; SSE4-NEXT: .LBB17_6: # %else4 ; SSE4-NEXT: pextrb $6, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_8 ; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: .LBB17_8: # %else6 ; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 @@ -8015,13 +8088,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_10 ; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) +; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) ; SSE4-NEXT: .LBB17_10: # %else8 ; SSE4-NEXT: pextrb $10, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_12 ; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) +; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) ; SSE4-NEXT: .LBB17_12: # %else10 ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqw %xmm2, %xmm1 @@ -8031,13 +8104,13 @@ ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_14 ; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) +; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) ; SSE4-NEXT: .LBB17_14: # %else12 ; SSE4-NEXT: pextrb $14, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je .LBB17_16 ; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) +; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) ; SSE4-NEXT: .LBB17_16: # %else14 ; SSE4-NEXT: retq ; @@ -8048,6 +8121,7 @@ ; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_2 @@ -8058,7 +8132,7 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_4 ; AVX-NEXT: # %bb.3: # %cond.store1 -; AVX-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX-NEXT: .LBB17_4: # %else2 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -8068,13 +8142,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_6 ; AVX-NEXT: # %bb.5: # %cond.store3 -; AVX-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX-NEXT: .LBB17_6: # %else4 ; AVX-NEXT: vpextrb $6, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_8 ; AVX-NEXT: # %bb.7: # %cond.store5 -; AVX-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX-NEXT: .LBB17_8: # %else6 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -8084,13 +8158,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_10 ; AVX-NEXT: # %bb.9: # %cond.store7 -; AVX-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX-NEXT: .LBB17_10: # %else8 ; AVX-NEXT: vpextrb $10, %xmm2, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_12 ; AVX-NEXT: # %bb.11: # %cond.store9 -; AVX-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX-NEXT: .LBB17_12: # %else10 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 @@ -8100,13 +8174,13 @@ ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_14 ; AVX-NEXT: # %bb.13: # %cond.store11 -; AVX-NEXT: vpextrb 
$12, %xmm0, 6(%rdi) +; AVX-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX-NEXT: .LBB17_14: # %else12 ; AVX-NEXT: vpextrb $14, %xmm1, %eax ; AVX-NEXT: testb $1, %al ; AVX-NEXT: je .LBB17_16 ; AVX-NEXT: # %bb.15: # %cond.store13 -; AVX-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX-NEXT: .LBB17_16: # %else14 ; AVX-NEXT: retq ; @@ -8119,6 +8193,7 @@ ; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 ; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 ; AVX512F-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_2 @@ -8133,7 +8208,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_4 ; AVX512F-NEXT: # %bb.3: # %cond.store1 -; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) +; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) ; AVX512F-NEXT: .LBB17_4: # %else2 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -8146,7 +8221,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_6 ; AVX512F-NEXT: # %bb.5: # %cond.store3 -; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) +; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) ; AVX512F-NEXT: .LBB17_6: # %else4 ; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 @@ -8156,7 +8231,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_8 ; AVX512F-NEXT: # %bb.7: # %cond.store5 -; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) +; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) ; AVX512F-NEXT: .LBB17_8: # %else6 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 @@ -8169,7 +8244,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_10 ; AVX512F-NEXT: # %bb.9: # %cond.store7 -; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) +; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) ; AVX512F-NEXT: .LBB17_10: # %else8 ; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 @@ -8179,7 +8254,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_12 ; AVX512F-NEXT: # %bb.11: # %cond.store9 -; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) +; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) ; AVX512F-NEXT: .LBB17_12: # %else10 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 @@ -8192,7 +8267,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_14 ; AVX512F-NEXT: # %bb.13: # %cond.store11 -; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) ; AVX512F-NEXT: .LBB17_14: # %else12 ; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 @@ -8202,7 +8277,7 @@ ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: je .LBB17_16 ; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) +; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) ; AVX512F-NEXT: .LBB17_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -8211,10 +8286,10 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmw %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll 
=================================================================== --- llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll +++ llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll @@ -22,13 +22,12 @@ define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind { ; X86-64-LABEL: t4: ; X86-64: ## %bb.0: -; X86-64-NEXT: movdq2q %xmm1, %mm0 -; X86-64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) ; X86-64-NEXT: movdq2q %xmm0, %mm0 ; X86-64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X86-64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-64-NEXT: paddb %xmm1, %xmm0 +; X86-64-NEXT: movdq2q %xmm1, %mm0 +; X86-64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) +; X86-64-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 +; X86-64-NEXT: paddb -{{[0-9]+}}(%rsp), %xmm0 ; X86-64-NEXT: movb $1, %al ; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL %v1a = bitcast x86_mmx %v1 to <8 x i8> Index: llvm/test/CodeGen/X86/mmx-arith.ll =================================================================== --- llvm/test/CodeGen/X86/mmx-arith.ll +++ llvm/test/CodeGen/X86/mmx-arith.ll @@ -13,8 +13,8 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $16, %esp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $48, %esp ; X32-NEXT: movl 12(%ebp), %ecx ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero @@ -26,7 +26,7 @@ ; X32-NEXT: movq %mm0, (%eax) ; X32-NEXT: paddusb (%ecx), %mm0 ; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp) -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: movq %mm0, (%eax) ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: psubb %xmm1, %xmm0 @@ -36,37 +36,24 @@ ; X32-NEXT: movq %mm0, (%eax) ; X32-NEXT: psubusb (%ecx), %mm0 ; X32-NEXT: movq %mm0, (%esp) -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-NEXT: movdqa (%esp), %xmm0 ; X32-NEXT: movq %mm0, (%eax) ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-NEXT: pmullw %xmm0, %xmm1 -; X32-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; X32-NEXT: movdqa %xmm1, %xmm2 -; X32-NEXT: pand %xmm0, %xmm2 -; X32-NEXT: packuswb %xmm2, %xmm2 -; X32-NEXT: movq %xmm2, (%eax) -; X32-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-NEXT: pand %xmm1, %xmm2 -; X32-NEXT: movdqa %xmm2, %xmm1 +; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-NEXT: pmullw %xmm1, %xmm0 +; X32-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-NEXT: packuswb %xmm0, %xmm0 +; X32-NEXT: movq %xmm0, (%eax) +; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: pand %xmm0, %xmm1 -; X32-NEXT: packuswb %xmm1, %xmm1 ; X32-NEXT: movq %xmm1, (%eax) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: por %xmm1, %xmm0 +; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-NEXT: por %xmm2, %xmm1 -; X32-NEXT: movdqa %xmm1, %xmm2 -; X32-NEXT: pand %xmm0, %xmm2 -; X32-NEXT: packuswb %xmm2, %xmm2 -; X32-NEXT: movq 
%xmm2, (%eax) -; X32-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-NEXT: pxor %xmm1, %xmm2 -; X32-NEXT: pand %xmm0, %xmm2 -; X32-NEXT: packuswb %xmm2, %xmm2 -; X32-NEXT: movq %xmm2, (%eax) +; X32-NEXT: pxor %xmm0, %xmm1 +; X32-NEXT: movq %xmm1, (%eax) ; X32-NEXT: emms ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -84,7 +71,7 @@ ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: paddusb (%rsi), %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: psubb %xmm1, %xmm0 @@ -94,37 +81,24 @@ ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: psubusb (%rsi), %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pand %xmm0, %xmm2 -; X64-NEXT: packuswb %xmm2, %xmm2 -; X64-NEXT: movq %xmm2, (%rdi) -; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X64-NEXT: pand %xmm1, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm1 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: pmullw %xmm1, %xmm0 +; X64-NEXT: pand {{.*}}(%rip), %xmm0 +; X64-NEXT: packuswb %xmm0, %xmm0 +; X64-NEXT: movq %xmm0, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: pand %xmm0, %xmm1 -; X64-NEXT: packuswb %xmm1, %xmm1 ; X64-NEXT: movq %xmm1, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X64-NEXT: por %xmm2, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pand %xmm0, %xmm2 -; X64-NEXT: packuswb %xmm2, %xmm2 -; X64-NEXT: movq %xmm2, (%rdi) -; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X64-NEXT: pxor %xmm1, %xmm2 -; X64-NEXT: pand %xmm0, %xmm2 -; X64-NEXT: packuswb %xmm2, %xmm2 -; X64-NEXT: movq %xmm2, (%rdi) +; X64-NEXT: pxor %xmm0, %xmm1 +; X64-NEXT: movq %xmm1, (%rdi) ; X64-NEXT: emms ; X64-NEXT: retq entry: @@ -182,66 +156,56 @@ define void @test1(x86_mmx* %A, x86_mmx* %B) { ; X32-LABEL: test1: ; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X32-NEXT: paddq %xmm0, %xmm1 
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: pmuludq %xmm1, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; X32-NEXT: movq %xmm1, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X32-NEXT: andps %xmm0, %xmm1 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: orps %xmm1, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; X32-NEXT: movq %xmm1, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X32-NEXT: xorps %xmm0, %xmm1 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: paddd %xmm0, %xmm1 +; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X32-NEXT: pmuludq %xmm0, %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-NEXT: pmuludq %xmm0, %xmm2 +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pand %xmm1, %xmm0 +; X32-NEXT: movq %xmm0, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: por %xmm0, %xmm1 +; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pxor %xmm1, %xmm0 +; X32-NEXT: movq %xmm0, (%ecx) ; X32-NEXT: emms ; X32-NEXT: retl ; ; X64-LABEL: test1: ; X64: # %bb.0: # %entry ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-NEXT: paddq %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rdi) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: pmuludq %xmm1, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; X64-NEXT: paddd %xmm0, %xmm1 ; X64-NEXT: movq %xmm1, (%rdi) -; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-NEXT: pand %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X64-NEXT: pmuludq %xmm0, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: pmuludq %xmm2, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: movq %xmm1, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: pand %xmm1, %xmm0 +; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-NEXT: pxor %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X64-NEXT: por 
%xmm0, %xmm1 +; X64-NEXT: movq %xmm1, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: emms ; X64-NEXT: retq @@ -294,8 +258,8 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $24, %esp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $64, %esp ; X32-NEXT: movl 12(%ebp), %ecx ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero @@ -307,7 +271,7 @@ ; X32-NEXT: movq %mm0, (%eax) ; X32-NEXT: paddusw (%ecx), %mm0 ; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp) -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: movq %mm0, (%eax) ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: psubw %xmm1, %xmm0 @@ -317,40 +281,25 @@ ; X32-NEXT: movq %mm0, (%eax) ; X32-NEXT: psubusw (%ecx), %mm0 ; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp) -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: movq %mm0, (%eax) -; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: pmullw %xmm0, %xmm1 -; X32-NEXT: movdq2q %xmm1, %mm0 -; X32-NEXT: movq %xmm1, (%eax) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pmullw {{[0-9]+}}(%esp), %xmm0 +; X32-NEXT: movdq2q %xmm0, %mm0 +; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: pmulhw (%ecx), %mm0 ; X32-NEXT: movq %mm0, (%eax) ; X32-NEXT: pmaddwd (%ecx), %mm0 ; X32-NEXT: movq %mm0, (%esp) -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; X32-NEXT: movq %mm0, (%eax) -; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-NEXT: pand %xmm0, %xmm1 -; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-NEXT: por %xmm1, %xmm0 -; X32-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; X32-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-NEXT: movq %xmm1, (%eax) -; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-NEXT: pxor %xmm0, %xmm1 -; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-NEXT: movq %xmm0, (%eax) +; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: andps (%esp), %xmm0 +; X32-NEXT: movlps %xmm0, (%eax) +; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: orps %xmm0, %xmm1 +; X32-NEXT: movlps %xmm1, (%eax) +; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: xorps %xmm1, %xmm0 +; X32-NEXT: movlps %xmm0, (%eax) ; X32-NEXT: emms ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -368,7 +317,7 @@ ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: paddusw (%rsi), %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: psubw %xmm1, %xmm0 @@ -378,40 +327,25 @@ ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: psubusw (%rsi), %mm0 ; X64-NEXT: movq %mm0, 
-{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: movq %mm0, (%rdi) -; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movq %xmm1, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: pmullw -{{[0-9]+}}(%rsp), %xmm0 +; X64-NEXT: movdq2q %xmm0, %mm0 +; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: pmulhw (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: pmaddwd (%rsi), %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; X64-NEXT: movq %mm0, (%rdi) -; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X64-NEXT: pand %xmm0, %xmm1 -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rdi) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; X64-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm1, (%rdi) -; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X64-NEXT: pxor %xmm0, %xmm1 -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rdi) +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: andps -{{[0-9]+}}(%rsp), %xmm0 +; X64-NEXT: movlps %xmm0, (%rdi) +; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: orps %xmm0, %xmm1 +; X64-NEXT: movlps %xmm1, (%rdi) +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: xorps %xmm1, %xmm0 +; X64-NEXT: movlps %xmm0, (%rdi) ; X64-NEXT: emms ; X64-NEXT: retq entry: @@ -479,45 +413,34 @@ ; X32-LABEL: test3: ; X32: # %bb.0: # %entry ; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $16, %esp -; X32-NEXT: cmpl $0, 16(%ebp) +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: je .LBB3_1 ; X32-NEXT: # %bb.2: # %bb26.preheader +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-NEXT: xorl %ebx, %ebx ; X32-NEXT: xorl %eax, %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: .p2align 4, 0x90 ; X32-NEXT: .LBB3_3: # %bb26 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl 8(%ebp), %ecx -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl (%ecx,%ebx,8), %ecx -; X32-NEXT: movl 4(%esi,%ebx,8), %esi -; X32-NEXT: movl 12(%ebp), %edi -; X32-NEXT: addl (%edi,%ebx,8), %ecx -; X32-NEXT: adcl 4(%edi,%ebx,8), %esi -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: movl %ecx, (%esp) -; X32-NEXT: adcl %edx, %esi -; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: movd %xmm0, %eax -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X32-NEXT: movd %xmm0, %edx +; X32-NEXT: movl (%edi,%ebx,8), %ebp +; X32-NEXT: movl 4(%edi,%ebx,8), %ecx +; X32-NEXT: addl (%esi,%ebx,8), %ebp +; X32-NEXT: adcl 4(%esi,%ebx,8), %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %ecx, %edx ; 
X32-NEXT: incl %ebx -; X32-NEXT: cmpl 16(%ebp), %ebx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: jb .LBB3_3 ; X32-NEXT: jmp .LBB3_4 ; X32-NEXT: .LBB3_1: ; X32-NEXT: xorl %eax, %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: .LBB3_4: # %bb31 -; X32-NEXT: leal -12(%ebp), %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/mmx-cvt.ll =================================================================== --- llvm/test/CodeGen/X86/mmx-cvt.ll +++ llvm/test/CodeGen/X86/mmx-cvt.ll @@ -296,8 +296,8 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $32, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movq (%eax), %mm0 ; X86-NEXT: paddd %mm0, %mm0 Index: llvm/test/CodeGen/X86/mulvi32.ll =================================================================== --- llvm/test/CodeGen/X86/mulvi32.ll +++ llvm/test/CodeGen/X86/mulvi32.ll @@ -7,36 +7,39 @@ ; PR6399 define <2 x i32> @_mul2xi32a(<2 x i32>, <2 x i32>) { -; SSE-LABEL: _mul2xi32a: -; SSE: # %bb.0: -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: _mul2xi32a: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSE42-LABEL: _mul2xi32a: +; SSE42: # %bb.0: +; SSE42-NEXT: pmulld %xmm1, %xmm0 +; SSE42-NEXT: retq ; ; AVX-LABEL: _mul2xi32a: ; AVX: # %bb.0: -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %r = mul <2 x i32> %0, %1 ret <2 x i32> %r } define <2 x i32> @_mul2xi32b(<2 x i32>, <2 x i32>) { -; SSE2-LABEL: _mul2xi32b: -; SSE2: # %bb.0: -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: retq -; -; SSE42-LABEL: _mul2xi32b: -; SSE42: # %bb.0: -; SSE42-NEXT: pmuludq %xmm1, %xmm0 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE42-NEXT: retq +; SSE-LABEL: _mul2xi32b: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: _mul2xi32b: ; AVX: # %bb.0: ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq %factor0 = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> %factor1 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> Index: llvm/test/CodeGen/X86/oddshuffles.ll =================================================================== --- llvm/test/CodeGen/X86/oddshuffles.ll +++ llvm/test/CodeGen/X86/oddshuffles.ll @@ -68,7 +68,7 @@ define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind { ; SSE2-LABEL: v3i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movd %xmm2, 8(%rdi) ; SSE2-NEXT: movq %xmm0, (%rdi) @@ -76,7 +76,7 @@ ; ; SSE42-LABEL: v3i32: ; SSE42: # %bb.0: -; SSE42-NEXT: extractps $2, %xmm0, 8(%rdi) +; SSE42-NEXT: extractps $1, %xmm0, 8(%rdi) ; SSE42-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE42-NEXT: movlps %xmm0, (%rdi) ; SSE42-NEXT: retq @@ -84,14 +84,14 @@ ; AVX-LABEL: v3i32: ; AVX: # %bb.0: ; AVX-NEXT: 
vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi) +; AVX-NEXT: vextractps $1, %xmm0, 8(%rdi) ; AVX-NEXT: vmovlps %xmm1, (%rdi) ; AVX-NEXT: retq ; ; XOP-LABEL: v3i32: ; XOP: # %bb.0: ; XOP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; XOP-NEXT: vextractps $2, %xmm0, 8(%rdi) +; XOP-NEXT: vextractps $1, %xmm0, 8(%rdi) ; XOP-NEXT: vmovlps %xmm1, (%rdi) ; XOP-NEXT: retq %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> @@ -102,58 +102,34 @@ define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind { ; SSE2-LABEL: v5i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: psrlq $16, %xmm1 +; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movw %ax, 8(%rdi) -; SSE2-NEXT: movq %xmm2, (%rdi) +; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: v5i16: ; SSE42: # %bb.0: -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; SSE42-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE42-NEXT: pextrw $6, %xmm0, 8(%rdi) -; SSE42-NEXT: movq %xmm2, (%rdi) +; SSE42-NEXT: psrlq $16, %xmm1 +; SSE42-NEXT: pextrw $3, %xmm0, 8(%rdi) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE42-NEXT: movq %xmm0, (%rdi) ; SSE42-NEXT: retq ; -; AVX1-LABEL: v5i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpextrw $6, %xmm0, 8(%rdi) -; AVX1-NEXT: vmovq %xmm1, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: v5i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpextrw $6, %xmm0, 8(%rdi) -; AVX2-SLOW-NEXT: vmovq %xmm1, (%rdi) -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: v5i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpextrw $6, %xmm0, 8(%rdi) -; AVX2-FAST-NEXT: vmovq %xmm1, (%rdi) -; AVX2-FAST-NEXT: retq +; AVX-LABEL: v5i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlq $16, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpextrw $3, %xmm0, 8(%rdi) +; AVX-NEXT: vmovq %xmm1, (%rdi) +; AVX-NEXT: retq ; ; XOP-LABEL: v5i16: ; XOP: # %bb.0: -; XOP-NEXT: vpperm {{.*#+}} xmm1 = 
xmm0[0,1],xmm1[4,5],xmm0[4,5],xmm1[8,9],xmm0[4,5],xmm1[4,5],xmm0[6,7],xmm1[6,7] -; XOP-NEXT: vpextrw $6, %xmm0, 8(%rdi) +; XOP-NEXT: vpsrlq $16, %xmm1, %xmm1 +; XOP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; XOP-NEXT: vpextrw $3, %xmm0, 8(%rdi) ; XOP-NEXT: vmovq %xmm1, (%rdi) ; XOP-NEXT: retq %r = shufflevector <4 x i16> %a, <4 x i16> %b, <5 x i32> @@ -251,42 +227,35 @@ define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind { ; SSE2-LABEL: v7i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,0,255,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,0,4,5,6,7] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SSE2-NEXT: movb %al, 6(%rdi) -; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: movd %xmm2, (%rdi) +; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: v7i8: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] ; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi) -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,4,7,4,3,6,0,u,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdi) ; SSE42-NEXT: movd %xmm1, (%rdi) ; SSE42-NEXT: retq ; ; AVX-LABEL: v7i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,4,7,4,3,6,0,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpextrb $0, %xmm1, 6(%rdi) ; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX-NEXT: vmovd %xmm0, (%rdi) @@ -294,7 +263,7 @@ ; ; XOP-LABEL: v7i8: ; XOP: # %bb.0: -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[8],xmm0[12],xmm1[8],xmm0[4],xmm1[12,0,u,u,u,u,u,u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[3],xmm1[2],xmm0[1],xmm1[3,0,u,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpextrb $0, %xmm1, 6(%rdi) ; XOP-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; XOP-NEXT: 
vmovd %xmm0, (%rdi) @@ -307,36 +276,32 @@ define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind { ; SSE2-LABEL: v7i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,0,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,6,4,7] ; SSE2-NEXT: movw %ax, 12(%rdi) -; SSE2-NEXT: movq %xmm2, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: v7i16: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] ; SSE42-NEXT: pextrw $0, %xmm1, 12(%rdi) -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] +; SSE42-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15] ; SSE42-NEXT: pextrd $2, %xmm1, 8(%rdi) ; SSE42-NEXT: movq %xmm1, (%rdi) ; SSE42-NEXT: retq ; ; AVX-LABEL: v7i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15] ; AVX-NEXT: vpextrw $0, %xmm1, 12(%rdi) ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi) ; AVX-NEXT: vmovq %xmm0, (%rdi) @@ -344,7 +309,7 @@ ; ; XOP-LABEL: v7i16: ; XOP: # %bb.0: -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[8,9],xmm0[12,13],xmm1[8,9],xmm0[4,5],xmm1[12,13,0,1,14,15] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[4,5],xmm0[6,7],xmm1[4,5],xmm0[2,3],xmm1[6,7,0,1],xmm0[6,7] ; XOP-NEXT: vpextrw $0, %xmm1, 12(%rdi) ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi) ; XOP-NEXT: vmovq %xmm0, (%rdi) @@ -410,19 +375,19 @@ define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind { ; SSE2-LABEL: v12i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} 
xmm2 = xmm0[3,1,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4] -; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255] ; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,1,1,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,3] ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: movq %xmm2, (%rdi) @@ -432,27 +397,23 @@ ; ; SSE42-LABEL: v12i8: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u] -; SSE42-NEXT: por %xmm1, %xmm0 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi) ; SSE42-NEXT: movq %xmm0, (%rdi) ; SSE42-NEXT: retq ; ; AVX-LABEL: v12i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi) ; AVX-NEXT: vmovq %xmm0, (%rdi) ; AVX-NEXT: retq ; ; XOP-LABEL: v12i8: ; XOP: # %bb.0: -; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u] -; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u] -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm1[0],xmm0[1,5],xmm1[1],xmm0[2,6],xmm1[2],xmm0[3,7],xmm1[3],xmm0[u,u,u,u] ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi) ; XOP-NEXT: vmovq %xmm0, (%rdi) ; XOP-NEXT: retq @@ -659,11 +620,7 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounwind { ; SSE2-LABEL: pr29025: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] @@ -675,9 +632,6 @@ ; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255] ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7] ; SSE2-NEXT: 
pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3] ; SSE2-NEXT: pandn %xmm2, %xmm1 @@ -689,11 +643,7 @@ ; ; SSE42-LABEL: pr29025: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 -; SSE42-NEXT: pshufb %xmm3, %xmm0 ; SSE42-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi) @@ -702,12 +652,8 @@ ; ; AVX-LABEL: pr29025: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi) ; AVX-NEXT: vmovq %xmm0, (%rdi) @@ -715,8 +661,8 @@ ; ; XOP-LABEL: pr29025: ; XOP: # %bb.0: -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4,8,12],xmm1[0,4,8,12],xmm0[u,u,u,u,u,u,u,u] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[4],xmm0[2,6],xmm2[8],xmm0[3,7],xmm2[12],xmm0[u,u,u,u] +; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[1],xmm0[2,6],xmm2[2],xmm0[3,7],xmm2[3],xmm0[u,u,u,u] ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi) ; XOP-NEXT: vmovq %xmm0, (%rdi) ; XOP-NEXT: retq @@ -751,39 +697,39 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE2-NEXT: packuswb %xmm0, %xmm4 -; SSE2-NEXT: movq %xmm4, (%rsi) -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; SSE2-NEXT: packuswb %xmm0, %xmm4 -; SSE2-NEXT: movq %xmm4, (%rdx) -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: movdqa 
{{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] +; SSE2-NEXT: packuswb %xmm0, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm6 ; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm4, (%rsi) +; SSE2-NEXT: movq %xmm5, (%rdx) ; SSE2-NEXT: movq %xmm0, (%rcx) ; SSE2-NEXT: retq ; @@ -796,16 +742,16 @@ ; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; SSE42-NEXT: por %xmm2, %xmm3 -; SSE42-NEXT: movq %xmm3, (%rsi) ; SSE42-NEXT: movdqa %xmm1, %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u] -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm2, %xmm3 -; SSE42-NEXT: movq %xmm3, (%rdx) +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,4,7,10,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm2, %xmm4 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; SSE42-NEXT: por %xmm1, %xmm0 +; SSE42-NEXT: movq %xmm3, (%rsi) +; SSE42-NEXT: movq %xmm4, (%rdx) ; SSE42-NEXT: movq %xmm0, (%rcx) ; SSE42-NEXT: retq ; @@ -816,14 +762,14 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmovq %xmm2, (%rsi) -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmovq %xmm2, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX-NEXT: 
vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm2, (%rsi) +; AVX-NEXT: vmovq %xmm3, (%rdx) ; AVX-NEXT: vmovq %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -834,14 +780,14 @@ ; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vmovq %xmm2, (%rsi) -; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vmovq %xmm2, (%rdx) +; XOP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; XOP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; XOP-NEXT: vpor %xmm3, %xmm4, %xmm3 ; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vmovq %xmm2, (%rsi) +; XOP-NEXT: vmovq %xmm3, (%rdx) ; XOP-NEXT: vmovq %xmm0, (%rcx) ; XOP-NEXT: retq %wide.vec = load <24 x i8>, <24 x i8>* %p, align 4 Index: llvm/test/CodeGen/X86/oddsubvector.ll =================================================================== --- llvm/test/CodeGen/X86/oddsubvector.ll +++ llvm/test/CodeGen/X86/oddsubvector.ll @@ -12,19 +12,12 @@ ; SSE2-LABEL: insert_v7i8_v2i16_2: ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE2-NEXT: movaps {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: andps %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: pextrw $3, %xmm1, %eax +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: movd %xmm1, (%rdi) -; SSE2-NEXT: pextrw $2, %xmm1, %eax +; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: retq ; @@ -32,52 +25,40 @@ ; SSE42: # %bb.0: ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; SSE42-NEXT: packuswb %xmm0, %xmm0 ; SSE42-NEXT: pextrb $6, %xmm1, 6(%rdi) -; SSE42-NEXT: pextrw $2, %xmm0, 4(%rdi) -; SSE42-NEXT: movd %xmm0, (%rdi) +; 
SSE42-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE42-NEXT: pextrw $1, %xmm0, 4(%rdi) +; SSE42-NEXT: movd %xmm1, (%rdi) ; SSE42-NEXT: retq ; ; AVX1-LABEL: insert_v7i8_v2i16_2: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-NEXT: vpextrb $6, %xmm1, 6(%rdi) -; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX1-NEXT: vmovd %xmm0, (%rdi) +; AVX1-NEXT: vpextrw $1, %xmm0, 4(%rdi) +; AVX1-NEXT: vmovd %xmm2, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_v7i8_v2i16_2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3] -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-NEXT: vpextrb $6, %xmm1, 6(%rdi) -; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: vmovd %xmm0, (%rdi) +; AVX2-NEXT: vpextrw $1, %xmm0, 4(%rdi) +; AVX2-NEXT: vmovd %xmm2, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: insert_v7i8_v2i16_2: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3] -; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vpextrb $6, %xmm1, 6(%rdi) -; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX512-NEXT: vmovd %xmm0, (%rdi) +; AVX512-NEXT: vpextrw $1, %xmm0, 4(%rdi) +; AVX512-NEXT: vmovd %xmm2, (%rdi) ; AVX512-NEXT: retq ; ; XOP-LABEL: insert_v7i8_v2i16_2: Index: llvm/test/CodeGen/X86/paddus.ll =================================================================== --- llvm/test/CodeGen/X86/paddus.ll +++ llvm/test/CodeGen/X86/paddus.ll @@ -1527,30 +1527,13 @@ ; SSE-NEXT: movq %xmm1, (%rdi) ; SSE-NEXT: retq ; -; AVX1-LABEL: addus_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: addus_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, 
(%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: addus_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovwb %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: addus_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq %ld1 = load <8 x i8>, <8 x i8>* %p1, align 8 %ld2 = load <8 x i8>, <8 x i8>* %p2, align 8 %1 = add <8 x i8> %ld2, %ld1 @@ -1569,30 +1552,13 @@ ; SSE-NEXT: movd %xmm1, (%rdi) ; SSE-NEXT: retq ; -; AVX1-LABEL: addus_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: addus_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: addus_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512-NEXT: vpmovdb %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: addus_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdi) +; AVX-NEXT: retq %ld1 = load <4 x i8>, <4 x i8>* %p1, align 4 %ld2 = load <4 x i8>, <4 x i8>* %p2, align 4 %1 = add <4 x i8> %ld2, %ld1 @@ -1635,36 +1601,15 @@ ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX1-LABEL: addus_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: movzwl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: movzwl (%rsi), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: addus_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: movzwl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: movzwl (%rsi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: addus_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: movzwl (%rsi), %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovqb %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: addus_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movzwl (%rsi), %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vpaddusb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX-NEXT: retq %ld1 = load <2 x i8>, <2 x i8>* %p1, align 2 %ld2 = load <2 x i8>, 
<2 x i8>* %p2, align 2 %1 = add <2 x i8> %ld2, %ld1 @@ -1683,30 +1628,13 @@ ; SSE-NEXT: movq %xmm1, (%rdi) ; SSE-NEXT: retq ; -; AVX1-LABEL: addus_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: addus_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: addus_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vpmovdw %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: addus_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq %ld1 = load <4 x i16>, <4 x i16>* %p1, align 4 %ld2 = load <4 x i16>, <4 x i16>* %p2, align 4 %1 = add <4 x i16> %ld2, %ld1 @@ -1725,30 +1653,13 @@ ; SSE-NEXT: movd %xmm1, (%rdi) ; SSE-NEXT: retq ; -; AVX1-LABEL: addus_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: addus_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: addus_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmovqw %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: addus_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpaddusw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdi) +; AVX-NEXT: retq %ld1 = load <2 x i16>, <2 x i16>* %p1, align 2 %ld2 = load <2 x i16>, <2 x i16>* %p2, align 2 %1 = add <2 x i16> %ld2, %ld1 Index: llvm/test/CodeGen/X86/pmaddubsw.ll =================================================================== --- llvm/test/CodeGen/X86/pmaddubsw.ll +++ llvm/test/CodeGen/X86/pmaddubsw.ll @@ -324,23 +324,23 @@ ; AVX1-NEXT: vmovdqa (%rsi), %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: 
vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm3 -; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 +; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 @@ -355,14 +355,14 @@ ; AVX2-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero ; AVX2-NEXT: vpmulld %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 @@ -377,14 +377,14 @@ ; AVX512-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512-NEXT: vpmovsxbd %xmm3, %ymm3 ; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero ; AVX512-NEXT: vpmulld %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 
@@ -452,23 +452,23 @@ ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa (%rsi), %xmm1 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -481,13 +481,13 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero ; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 @@ -501,13 +501,13 @@ ; 
AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpmovsxbd %xmm2, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd %xmm2, %ymm2 ; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero ; AVX512-NEXT: vpmulld %ymm3, %ymm2, %ymm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 Index: llvm/test/CodeGen/X86/pmulh.ll =================================================================== --- llvm/test/CodeGen/X86/pmulh.ll +++ llvm/test/CodeGen/X86/pmulh.ll @@ -8,45 +8,14 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) { -; SSE2-PROMOTE-LABEL: mulhuw_v4i16: -; SSE2-PROMOTE: # %bb.0: -; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-PROMOTE-NEXT: pmulhuw %xmm1, %xmm0 -; SSE2-PROMOTE-NEXT: pxor %xmm1, %xmm1 -; SSE2-PROMOTE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-PROMOTE-NEXT: retq -; -; SSE2-WIDEN-LABEL: mulhuw_v4i16: -; SSE2-WIDEN: # %bb.0: -; SSE2-WIDEN-NEXT: pmulhuw %xmm1, %xmm0 -; SSE2-WIDEN-NEXT: retq -; -; SSE41-PROMOTE-LABEL: mulhuw_v4i16: -; SSE41-PROMOTE: # %bb.0: -; SSE41-PROMOTE-NEXT: pxor %xmm2, %xmm2 -; SSE41-PROMOTE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SSE41-PROMOTE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-PROMOTE-NEXT: pmulld %xmm1, %xmm0 -; SSE41-PROMOTE-NEXT: psrld $16, %xmm0 -; SSE41-PROMOTE-NEXT: retq -; -; SSE41-WIDEN-LABEL: mulhuw_v4i16: -; SSE41-WIDEN: # %bb.0: -; SSE41-WIDEN-NEXT: pmulhuw %xmm1, %xmm0 -; SSE41-WIDEN-NEXT: retq +; SSE-LABEL: mulhuw_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pmulhuw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: mulhuw_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %a1 
= zext <4 x i16> %a to <4 x i32> %b1 = zext <4 x i16> %b to <4 x i32> @@ -57,47 +26,14 @@ } define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) { -; SSE2-PROMOTE-LABEL: mulhw_v4i16: -; SSE2-PROMOTE: # %bb.0: -; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-PROMOTE-NEXT: pmulhw %xmm1, %xmm0 -; SSE2-PROMOTE-NEXT: pxor %xmm1, %xmm1 -; SSE2-PROMOTE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-PROMOTE-NEXT: retq -; -; SSE2-WIDEN-LABEL: mulhw_v4i16: -; SSE2-WIDEN: # %bb.0: -; SSE2-WIDEN-NEXT: pmulhw %xmm1, %xmm0 -; SSE2-WIDEN-NEXT: retq -; -; SSE41-PROMOTE-LABEL: mulhw_v4i16: -; SSE41-PROMOTE: # %bb.0: -; SSE41-PROMOTE-NEXT: pslld $16, %xmm0 -; SSE41-PROMOTE-NEXT: psrad $16, %xmm0 -; SSE41-PROMOTE-NEXT: pslld $16, %xmm1 -; SSE41-PROMOTE-NEXT: psrad $16, %xmm1 -; SSE41-PROMOTE-NEXT: pmulld %xmm1, %xmm0 -; SSE41-PROMOTE-NEXT: psrld $16, %xmm0 -; SSE41-PROMOTE-NEXT: retq -; -; SSE41-WIDEN-LABEL: mulhw_v4i16: -; SSE41-WIDEN: # %bb.0: -; SSE41-WIDEN-NEXT: pmulhw %xmm1, %xmm0 -; SSE41-WIDEN-NEXT: retq +; SSE-LABEL: mulhw_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pmulhw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: mulhw_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %a1 = sext <4 x i16> %a to <4 x i32> %b1 = sext <4 x i16> %b to <4 x i32> Index: llvm/test/CodeGen/X86/pointer-vector.ll =================================================================== --- llvm/test/CodeGen/X86/pointer-vector.ll +++ llvm/test/CodeGen/X86/pointer-vector.ll @@ -117,7 +117,7 @@ ; CHECK-LABEL: BITCAST1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retl entry: %G = load <2 x i8*>, <2 x i8*>* %p Index: llvm/test/CodeGen/X86/pr14161.ll =================================================================== --- llvm/test/CodeGen/X86/pr14161.ll +++ llvm/test/CodeGen/X86/pr14161.ll @@ -7,7 +7,6 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movdqa (%rdi), %xmm0 ; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pmovzxwq %xmm0, %xmm0 ; CHECK-NEXT: retq entry: %2 = load <4 x i32>, <4 x i32>* %0, align 16 @@ -27,7 +26,6 @@ ; CHECK-NEXT: movdqa (%rdi), %xmm0 ; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; CHECK-NEXT: retq entry: %2 = load <4 x i32>, <4 x i32>* %0, align 16 Index: llvm/test/CodeGen/X86/pr35918.ll =================================================================== --- llvm/test/CodeGen/X86/pr35918.ll +++ llvm/test/CodeGen/X86/pr35918.ll @@ -5,79 +5,31 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=X64,X64-SKX define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32], [128 x i64] }*) nounwind { -; X86-SKYLAKE-LABEL: 
fetch_r16g16_snorm_unorm8: -; X86-SKYLAKE: # %bb.0: # %entry -; X86-SKYLAKE-NEXT: subl $8, %esp -; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X86-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0 -; X86-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X86-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0 -; X86-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u] -; X86-SKYLAKE-NEXT: vmovd %xmm0, %ecx -; X86-SKYLAKE-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 -; X86-SKYLAKE-NEXT: movl %ecx, (%eax) -; X86-SKYLAKE-NEXT: addl $8, %esp -; X86-SKYLAKE-NEXT: retl +; X86-LABEL: fetch_r16g16_snorm_unorm8: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpsrlw $7, %xmm0, %xmm0 +; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u] +; X86-NEXT: vmovd %xmm0, %ecx +; X86-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: retl ; -; X86-SKX-LABEL: fetch_r16g16_snorm_unorm8: -; X86-SKX: # %bb.0: # %entry -; X86-SKX-NEXT: subl $8, %esp -; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u] -; X86-SKX-NEXT: vpsrad $16, %xmm0, %xmm0 -; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X86-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; X86-SKX-NEXT: vpsrld $7, %xmm0, %xmm0 -; X86-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; X86-SKX-NEXT: vpmovqw %xmm0, {{[0-9]+}}(%esp) -; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-SKX-NEXT: vpmovdb %xmm0, (%esp) -; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SKX-NEXT: movzwl (%esp), %ecx -; X86-SKX-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 -; X86-SKX-NEXT: movl %ecx, (%eax) -; X86-SKX-NEXT: addl $8, %esp -; X86-SKX-NEXT: retl -; -; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8: -; X64-SKYLAKE: # %bb.0: # %entry -; X64-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X64-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0 -; X64-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X64-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0 -; X64-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u] -; X64-SKYLAKE-NEXT: vmovd %xmm0, %eax -; X64-SKYLAKE-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; X64-SKYLAKE-NEXT: movl %eax, (%rdi) -; X64-SKYLAKE-NEXT: retq -; -; X64-SKX-LABEL: fetch_r16g16_snorm_unorm8: -; X64-SKX: # %bb.0: # %entry -; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u] -; X64-SKX-NEXT: vpsrad $16, %xmm0, %xmm0 -; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X64-SKX-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; X64-SKX-NEXT: vpsrld $7, %xmm0, %xmm0 -; X64-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; X64-SKX-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp) -; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-SKX-NEXT: vpmovdb %xmm0, -{{[0-9]+}}(%rsp) -; X64-SKX-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax -; X64-SKX-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; X64-SKX-NEXT: movl %eax, (%rdi) -; X64-SKX-NEXT: retq +; X64-LABEL: fetch_r16g16_snorm_unorm8: +; X64: # %bb.0: # %entry +; X64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpsrlw $7, %xmm0, %xmm0 +; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u] +; X64-NEXT: vmovd %xmm0, %eax +; X64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 +; X64-NEXT: movl %eax, (%rdi) +; X64-NEXT: retq entry: %5 = bitcast i8* %1 to <2 x i16>* %6 = load <2 x i16>, <2 x i16>* %5, align 2 Index: llvm/test/CodeGen/X86/pr40994.ll =================================================================== --- llvm/test/CodeGen/X86/pr40994.ll +++ llvm/test/CodeGen/X86/pr40994.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: pextrb $10, %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: pextrb $12, %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: pextrb $14, %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retq %v = alloca i8, i32 8, align 16 call void @llvm.masked.compressstore.v16i8(<16 x i8> %a, i8* %v, <16 x i1> ) Index: llvm/test/CodeGen/X86/promote-vec3.ll =================================================================== --- llvm/test/CodeGen/X86/promote-vec3.ll +++ llvm/test/CodeGen/X86/promote-vec3.ll @@ -14,7 +14,7 @@ ; SSE3-NEXT: movd %edx, %xmm0 ; SSE3-NEXT: pinsrw $1, %ecx, %xmm0 ; SSE3-NEXT: pinsrw $2, %eax, %xmm0 -; SSE3-NEXT: pextrw $0, %xmm0, %eax +; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: pextrw $1, %xmm0, %edx ; SSE3-NEXT: pextrw $2, %xmm0, %ecx ; SSE3-NEXT: # kill: def $ax killed $ax killed $eax @@ -24,13 +24,13 @@ ; ; SSE41-LABEL: zext_i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0 -; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0 -; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0 +; SSE41-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: pextrw $2, %xmm0, %edx -; SSE41-NEXT: pextrw $4, %xmm0, %ecx +; SSE41-NEXT: pextrw $1, %xmm0, %edx +; SSE41-NEXT: pextrw $2, %xmm0, %ecx ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: # kill: def $dx killed $dx killed $edx ; SSE41-NEXT: # kill: def $cx killed $cx killed $ecx @@ -38,13 +38,13 @@ ; ; AVX-32-LABEL: zext_i8: ; AVX-32: # %bb.0: -; AVX-32-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrb $0, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovd {{.*#+}} 
xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-32-NEXT: vmovd %xmm0, %eax -; AVX-32-NEXT: vpextrw $2, %xmm0, %edx -; AVX-32-NEXT: vpextrw $4, %xmm0, %ecx +; AVX-32-NEXT: vpextrw $1, %xmm0, %edx +; AVX-32-NEXT: vpextrw $2, %xmm0, %ecx ; AVX-32-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-32-NEXT: # kill: def $dx killed $dx killed $edx ; AVX-32-NEXT: # kill: def $cx killed $cx killed $ecx @@ -53,12 +53,12 @@ ; AVX-64-LABEL: zext_i8: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovd %edi, %xmm0 -; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 -; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-64-NEXT: vmovd %xmm0, %eax -; AVX-64-NEXT: vpextrw $2, %xmm0, %edx -; AVX-64-NEXT: vpextrw $4, %xmm0, %ecx +; AVX-64-NEXT: vpextrw $1, %xmm0, %edx +; AVX-64-NEXT: vpextrw $2, %xmm0, %ecx ; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx ; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx @@ -71,13 +71,16 @@ ; SSE3-LABEL: sext_i8: ; SSE3: # %bb.0: ; SSE3-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE3-NEXT: shll $8, %eax +; SSE3-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE3-NEXT: shll $8, %ecx +; SSE3-NEXT: movd %ecx, %xmm0 ; SSE3-NEXT: pinsrw $1, %eax, %xmm0 ; SSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE3-NEXT: shll $8, %eax ; SSE3-NEXT: pinsrw $2, %eax, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 ; SSE3-NEXT: psraw $8, %xmm0 -; SSE3-NEXT: pextrw $0, %xmm0, %eax +; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: pextrw $1, %xmm0, %edx ; SSE3-NEXT: pextrw $2, %xmm0, %ecx ; SSE3-NEXT: # kill: def $ax killed $ax killed $eax @@ -88,13 +91,12 @@ ; SSE41-LABEL: sext_i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0 -; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0 -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: psrad $24, %xmm0 +; SSE41-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0 +; SSE41-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0 +; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: pextrw $2, %xmm0, %edx -; SSE41-NEXT: pextrw $4, %xmm0, %ecx +; SSE41-NEXT: pextrw $1, %xmm0, %edx +; SSE41-NEXT: pextrw $2, %xmm0, %ecx ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: # kill: def $dx killed $dx killed $edx ; SSE41-NEXT: # kill: def $cx killed $cx killed $ecx @@ -103,13 +105,12 @@ ; AVX-32-LABEL: sext_i8: ; AVX-32: # %bb.0: ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX-32-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX-32-NEXT: vmovd %xmm0, %eax -; AVX-32-NEXT: vpextrw $2, %xmm0, %edx -; AVX-32-NEXT: vpextrw $4, %xmm0, %ecx +; AVX-32-NEXT: vpextrw $1, %xmm0, 
%edx +; AVX-32-NEXT: vpextrw $2, %xmm0, %ecx ; AVX-32-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-32-NEXT: # kill: def $dx killed $dx killed $edx ; AVX-32-NEXT: # kill: def $cx killed $cx killed $ecx @@ -118,13 +119,12 @@ ; AVX-64-LABEL: sext_i8: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovd %edi, %xmm0 -; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 -; AVX-64-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX-64-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX-64-NEXT: vmovd %xmm0, %eax -; AVX-64-NEXT: vpextrw $2, %xmm0, %edx -; AVX-64-NEXT: vpextrw $4, %xmm0, %ecx +; AVX-64-NEXT: vpextrw $1, %xmm0, %edx +; AVX-64-NEXT: vpextrw $2, %xmm0, %ecx ; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx ; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx Index: llvm/test/CodeGen/X86/promote.ll =================================================================== --- llvm/test/CodeGen/X86/promote.ll +++ llvm/test/CodeGen/X86/promote.ll @@ -6,18 +6,19 @@ ; X86-LABEL: mul_f: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-NEXT: pmaddwd %xmm0, %xmm0 -; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: pmullw %xmm0, %xmm0 +; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] ; X86-NEXT: movd %xmm0, (%eax) ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: retl ; ; X64-LABEL: mul_f: ; X64: # %bb.0: # %entry -; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-NEXT: pmaddwd %xmm0, %xmm0 -; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; X64-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-NEXT: pmullw %xmm0, %xmm0 +; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] ; X64-NEXT: movd %xmm0, (%rax) ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: retq @@ -32,18 +33,16 @@ ; X86-LABEL: shuff_f: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-NEXT: paddd %xmm0, %xmm0 -; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: paddb %xmm0, %xmm0 ; X86-NEXT: movd %xmm0, (%eax) ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: retl ; ; X64-LABEL: shuff_f: ; X64: # %bb.0: # %entry -; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-NEXT: paddd %xmm0, %xmm0 -; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: paddb %xmm0, %xmm0 ; X64-NEXT: movd %xmm0, (%rax) ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: retq Index: llvm/test/CodeGen/X86/psubus.ll =================================================================== --- llvm/test/CodeGen/X86/psubus.ll 
+++ llvm/test/CodeGen/X86/psubus.ll @@ -2184,30 +2184,13 @@ ; SSE-NEXT: movq %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX1-LABEL: subus_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: subus_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: subus_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovwb %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: subus_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq %ld1 = load <8 x i8>, <8 x i8>* %p1, align 8 %ld2 = load <8 x i8>, <8 x i8>* %p2, align 8 %1 = sub <8 x i8> %ld1, %ld2 @@ -2220,36 +2203,19 @@ define void @subus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) { ; SSE-LABEL: subus_v4i8: ; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: psubusb %xmm1, %xmm0 ; SSE-NEXT: movd %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX1-LABEL: subus_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: subus_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: subus_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512-NEXT: vpmovdb %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: subus_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdi) +; AVX-NEXT: retq %ld1 = load <4 x i8>, <4 x i8>* %p1, align 8 %ld2 = load <4 x i8>, <4 x i8>* %p2, align 8 %1 = sub <4 x i8> %ld1, %ld2 @@ -2262,8 +2228,8 @@ define void @subus_v2i8(<2 x i8>* %p1, <2 x i8>* %p2) { ; SSE2-LABEL: subus_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE2-NEXT: psubusb %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) @@ -2271,8 +2237,8 @@ ; ; SSSE3-LABEL: subus_v2i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSSE3-NEXT: psubusb %xmm1, %xmm0 ; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, (%rdi) @@ -2280,36 +2246,19 @@ ; ; SSE41-LABEL: subus_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE41-NEXT: psubusb %xmm1, %xmm0 ; SSE41-NEXT: pextrw $0, %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX1-LABEL: subus_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: subus_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: subus_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovqb %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: subus_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX-NEXT: retq %ld1 = load <2 x i8>, <2 x i8>* %p1, align 8 %ld2 = load <2 x i8>, <2 x i8>* %p2, align 8 %1 = sub <2 x i8> %ld1, %ld2 @@ -2328,30 +2277,13 @@ ; SSE-NEXT: movq %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX1-LABEL: subus_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: subus_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: subus_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vpmovdw %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: subus_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq %ld1 = load <4 x i16>, <4 x i16>* %p1, align 8 %ld2 = load <4 x i16>, <4 x i16>* %p2, align 8 %1 = sub <4 x i16> %ld1, %ld2 @@ -2364,36 +2296,19 @@ define void @subus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) { ; SSE-LABEL: subus_v2i16: ; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: psubusw %xmm1, 
%xmm0 ; SSE-NEXT: movd %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX1-LABEL: subus_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: subus_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: subus_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmovqw %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: subus_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdi) +; AVX-NEXT: retq %ld1 = load <2 x i16>, <2 x i16>* %p1, align 8 %ld2 = load <2 x i16>, <2 x i16>* %p2, align 8 %1 = sub <2 x i16> %ld1, %ld2 Index: llvm/test/CodeGen/X86/ret-mmx.ll =================================================================== --- llvm/test/CodeGen/X86/ret-mmx.ll +++ llvm/test/CodeGen/X86/ret-mmx.ll @@ -33,7 +33,7 @@ ; CHECK-LABEL: t3: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: retq ret <2 x i32> } Index: llvm/test/CodeGen/X86/sad.ll =================================================================== --- llvm/test/CodeGen/X86/sad.ll +++ llvm/test/CodeGen/X86/sad.ll @@ -1074,12 +1074,13 @@ ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psadbw %xmm3, %xmm2 -; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; @@ -1096,12 +1097,13 @@ ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] ; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX-NEXT: addq $4, %rax ; AVX-NEXT: jne .LBB3_1 ; AVX-NEXT: # %bb.2: # %middle.block -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq entry: Index: llvm/test/CodeGen/X86/sadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -210,30 +210,13 @@ ; SSE-NEXT: movq %xmm1, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v8i8: -; AVX2: 
# %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovwb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <8 x i8>, <8 x i8>* %px %y = load <8 x i8>, <8 x i8>* %py %z = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y) @@ -250,30 +233,13 @@ ; SSE-NEXT: movd %xmm1, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512-NEXT: vpmovdb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <4 x i8>, <4 x i8>* %px %y = load <4 x i8>, <4 x i8>* %py %z = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %x, <4 x i8> %y) @@ -314,36 +280,15 @@ ; SSE41-NEXT: pextrw $0, %xmm1, (%rdx) ; SSE41-NEXT: retq ; -; AVX1-LABEL: v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: movzwl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: movzwl (%rsi), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: movzwl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: movzwl (%rsi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: movzwl (%rsi), %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovqb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v2i8: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movzwl (%rsi), %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vpaddsb %xmm1, %xmm0, 
%xmm0 +; AVX-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <2 x i8>, <2 x i8>* %px %y = load <2 x i8>, <2 x i8>* %py %z = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %x, <2 x i8> %y) @@ -360,30 +305,13 @@ ; SSE-NEXT: movq %xmm1, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vpmovdw %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <4 x i16>, <4 x i16>* %px %y = load <4 x i16>, <4 x i16>* %py %z = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %x, <4 x i16> %y) @@ -400,30 +328,13 @@ ; SSE-NEXT: movd %xmm1, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmovqw %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <2 x i16>, <2 x i16>* %px %y = load <2 x i16>, <2 x i16>* %py %z = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y) @@ -658,240 +569,133 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, 
%xmm4 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm2, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: psllq $32, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: paddq %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm2, %xmm5 ; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; 
SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: psrld $1, %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm0, %xmm2 -; SSSE3-NEXT: por %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddq %xmm1, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 -; SSE41-NEXT: pandn %xmm4, %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm5, 
%xmm5 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: paddd %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pandn %xmm4, %xmm3 +; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 ; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1 +; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 ; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1 +; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i32: ; AVX512: # %bb.0: -; 
AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 ; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2 ; AVX512-NEXT: kxorw %k2, %k1, %k1 ; AVX512-NEXT: kandnw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} -; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2} +; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; AVX512-NEXT: retq %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z Index: llvm/test/CodeGen/X86/scalar_widen_div.ll =================================================================== --- llvm/test/CodeGen/X86/scalar_widen_div.ll +++ llvm/test/CodeGen/X86/scalar_widen_div.ll @@ -147,30 +147,34 @@ define <4 x i16> @test_ushort_div(<4 x i16> %num, <4 x i16> %div) { ; CHECK-LABEL: test_ushort_div: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; CHECK-NEXT: pextrd $1, %xmm0, %eax -; CHECK-NEXT: pextrd $1, %xmm1, %ecx +; CHECK-NEXT: pextrw $1, %xmm0, %eax +; CHECK-NEXT: pextrw $1, %xmm1, %ecx +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %ecx +; CHECK-NEXT: divw %cx ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: movd %xmm1, %esi +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %esi +; CHECK-NEXT: divw %si +; CHECK-NEXT: # kill: def $ax killed $ax def $eax ; CHECK-NEXT: movd %eax, %xmm2 -; CHECK-NEXT: pinsrd $1, %ecx, %xmm2 -; CHECK-NEXT: pextrd $2, %xmm0, %eax -; CHECK-NEXT: pextrd $2, %xmm1, %ecx +; CHECK-NEXT: pinsrw $1, %ecx, %xmm2 +; CHECK-NEXT: pextrw $2, %xmm0, %eax +; CHECK-NEXT: pextrw $2, %xmm1, %ecx +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %ecx -; CHECK-NEXT: pinsrd $2, %eax, %xmm2 -; CHECK-NEXT: pextrd $3, %xmm0, %eax -; CHECK-NEXT: pextrd $3, %xmm1, %ecx +; CHECK-NEXT: divw %cx +; CHECK-NEXT: # kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $2, %eax, %xmm2 +; CHECK-NEXT: pextrw $3, %xmm0, %eax +; CHECK-NEXT: pextrw $3, %xmm1, %ecx +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %ecx -; CHECK-NEXT: pinsrd $3, %eax, %xmm2 +; CHECK-NEXT: divw %cx +; CHECK-NEXT: # kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $3, %eax, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %div.r = udiv <4 x i16> %num, %div @@ -257,31 +261,34 @@ define <4 x i8> @test_char_rem(<4 x i8> %num, <4 x i8> %rem) { ; CHECK-LABEL: test_char_rem: ; CHECK: # %bb.0: -; CHECK-NEXT: pslld $24, %xmm1 -; CHECK-NEXT: psrad $24, %xmm1 -; 
CHECK-NEXT: pslld $24, %xmm0 -; CHECK-NEXT: psrad $24, %xmm0 -; CHECK-NEXT: pextrd $1, %xmm0, %eax -; CHECK-NEXT: pextrd $1, %xmm1, %ecx -; CHECK-NEXT: cltd -; CHECK-NEXT: idivl %ecx -; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movd %xmm1, %esi -; CHECK-NEXT: cltd -; CHECK-NEXT: idivl %esi -; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pinsrd $1, %ecx, %xmm2 -; CHECK-NEXT: pextrd $2, %xmm0, %eax -; CHECK-NEXT: pextrd $2, %xmm1, %ecx -; CHECK-NEXT: cltd -; CHECK-NEXT: idivl %ecx -; CHECK-NEXT: pinsrd $2, %edx, %xmm2 -; CHECK-NEXT: pextrd $3, %xmm0, %eax -; CHECK-NEXT: pextrd $3, %xmm1, %ecx -; CHECK-NEXT: cltd -; CHECK-NEXT: idivl %ecx -; CHECK-NEXT: pinsrd $3, %edx, %xmm2 +; CHECK-NEXT: pextrb $1, %xmm0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: cbtw +; CHECK-NEXT: pextrb $1, %xmm1, %ecx +; CHECK-NEXT: idivb %cl +; CHECK-NEXT: movsbl %ah, %ecx +; CHECK-NEXT: pextrb $0, %xmm0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: cbtw +; CHECK-NEXT: pextrb $0, %xmm1, %edx +; CHECK-NEXT: idivb %dl +; CHECK-NEXT: movsbl %ah, %eax +; CHECK-NEXT: movd %eax, %xmm2 +; CHECK-NEXT: pextrb $2, %xmm0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: cbtw +; CHECK-NEXT: pinsrb $1, %ecx, %xmm2 +; CHECK-NEXT: pextrb $2, %xmm1, %ecx +; CHECK-NEXT: idivb %cl +; CHECK-NEXT: movsbl %ah, %ecx +; CHECK-NEXT: pextrb $3, %xmm0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: cbtw +; CHECK-NEXT: pinsrb $2, %ecx, %xmm2 +; CHECK-NEXT: pextrb $3, %xmm1, %ecx +; CHECK-NEXT: idivb %cl +; CHECK-NEXT: movsbl %ah, %eax +; CHECK-NEXT: pinsrb $3, %eax, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %rem.r = srem <4 x i8> %num, %rem Index: llvm/test/CodeGen/X86/select.ll =================================================================== --- llvm/test/CodeGen/X86/select.ll +++ llvm/test/CodeGen/X86/select.ll @@ -215,17 +215,27 @@ } define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind { -; CHECK-LABEL: test5: -; CHECK: ## %bb.0: -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: jne LBB4_2 -; CHECK-NEXT: ## %bb.1: -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: LBB4_2: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-NEXT: movd %xmm0, (%rsi) -; CHECK-NEXT: retq +; GENERIC-LABEL: test5: +; GENERIC: ## %bb.0: +; GENERIC-NEXT: testb $1, %dil +; GENERIC-NEXT: jne LBB4_2 +; GENERIC-NEXT: ## %bb.1: +; GENERIC-NEXT: movaps %xmm1, %xmm0 +; GENERIC-NEXT: LBB4_2: +; GENERIC-NEXT: movss %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test5: +; ATOM: ## %bb.0: +; ATOM-NEXT: testb $1, %dil +; ATOM-NEXT: jne LBB4_2 +; ATOM-NEXT: ## %bb.1: +; ATOM-NEXT: movaps %xmm1, %xmm0 +; ATOM-NEXT: LBB4_2: +; ATOM-NEXT: movss %xmm0, (%rsi) +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq ; ; ATHLON-LABEL: test5: ; ATHLON: ## %bb.0: Index: llvm/test/CodeGen/X86/shrink_vmul.ll =================================================================== --- llvm/test/CodeGen/X86/shrink_vmul.ll +++ llvm/test/CodeGen/X86/shrink_vmul.ll @@ -42,10 +42,13 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl @@ -68,10 +71,13 @@ ; X64-AVX-LABEL: mul_2xi8: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm1 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -475,10 +481,11 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl @@ -498,10 +505,11 @@ ; X64-AVX-LABEL: mul_2xi16: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -904,10 +912,13 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; 
X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1 -; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl @@ -932,10 +943,13 @@ ; X64-AVX-LABEL: mul_2xi8_sext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1 -; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm1 +; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -992,10 +1006,13 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl @@ -1021,10 +1038,13 @@ ; X64-AVX-LABEL: mul_2xi8_sext_zext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm1 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1075,10 +1095,11 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1 -; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovsxwd %xmm1, %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, 
%xmm1, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl @@ -1098,10 +1119,11 @@ ; X64-AVX-LABEL: mul_2xi16_sext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1 -; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovsxwd %xmm1, %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1138,14 +1160,15 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1156,10 +1179,11 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 -; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl @@ -1170,23 +1194,25 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pxor %xmm2, %xmm2 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_sext_zext: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 -; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -1379,8 +1405,8 @@ ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -1389,9 +1415,10 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1402,17 +1429,18 @@ ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: pxor %xmm1, %xmm1 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi8_varconst1: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1454,9 +1482,10 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 -; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1476,9 +1505,10 @@ ; X64-AVX-LABEL: mul_2xi8_varconst2: ; X64-AVX: # %bb.0: # 
%entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 -; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1509,11 +1539,8 @@ ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X86-SSE-NEXT: pmullw %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -1522,9 +1549,10 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1535,20 +1563,18 @@ ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: pxor %xmm1, %xmm1 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X64-SSE-NEXT: pmullw %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi8_varconst3: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1592,9 +1618,10 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1616,9 +1643,10 @@ ; X64-AVX-LABEL: mul_2xi8_varconst4: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1662,9 +1690,10 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 -; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1686,9 +1715,10 @@ ; X64-AVX-LABEL: mul_2xi8_varconst5: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 -; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1732,9 +1762,10 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 -; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1756,9 +1787,10 @@ ; X64-AVX-LABEL: mul_2xi8_varconst6: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 -; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1799,9 +1831,9 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; 
X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1820,9 +1852,9 @@ ; X64-AVX-LABEL: mul_2xi16_varconst1: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1863,9 +1895,9 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 -; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1884,9 +1916,9 @@ ; X64-AVX-LABEL: mul_2xi16_varconst2: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 -; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1916,9 +1948,12 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -1927,9 +1962,9 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -1939,18 +1974,21 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pxor %xmm1, %xmm1 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_varconst3: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: @@ -1980,9 +2018,12 @@ ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -1991,9 +2032,9 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 -; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; @@ -2003,18 +2044,21 @@ ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_varconst4: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 -; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq %xmm0, 
(%rax,%rsi,4) ; X64-AVX-NEXT: retq entry: Index: llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll =================================================================== --- llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -44,32 +44,12 @@ ; AVX-NEXT: vmovq %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v8i8_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v8i8_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v8i8_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v8i8_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -100,31 +80,12 @@ ; AVX-NEXT: vmovq %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v8i16_to_v4i16_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_to_v4i16_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v8i16_to_v4i16_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v8i16_to_v4i16_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %L %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> store <4 x i16> %strided.vec, <4 x i16>* %S @@ -144,29 +105,11 @@ ; AVX-NEXT: vmovlps %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512BW: # 
%bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v4i32_to_v2i32_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %L %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> store <2 x i32> %strided.vec, <2 x i32>* %S @@ -204,32 +147,12 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v4i8_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v4i8_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S @@ -263,31 +186,12 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v4i8_2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_2: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_2: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v4i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S @@ -325,31 +229,12 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v4i8_3: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_3: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $24, (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_3: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_3: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v4i8_3: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S @@ -394,8 +279,9 @@ ; ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vmovd %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1: @@ -407,8 +293,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %L %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> @@ -454,8 +341,9 @@ ; ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vmovd %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2: @@ -467,8 +355,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %L %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> @@ -514,8 +403,9 @@ ; ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlq $48, (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vmovd %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3: @@ -527,8 +417,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %L %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> @@ -565,32 +456,12 @@ ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v2i8_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v2i8_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> store <2 x i8> %strided.vec, <2 x i8>* %S @@ -623,31 +494,12 @@ ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v2i8_2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_2: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_2: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v2i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> store <2 x i8> %strided.vec, <2 x i8>* %S @@ -683,31 +535,12 @@ ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v2i8_3: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_3: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $24, (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_3: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 
(%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_3: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v2i8_3: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> store <2 x i8> %strided.vec, <2 x i8>* %S @@ -740,31 +573,12 @@ ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v2i8_4: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_4: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_4: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_4: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3] -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v2i8_4: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> store <2 x i8> %strided.vec, <2 x i8>* %S @@ -800,31 +614,12 @@ ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v2i8_5: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_5: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlq $40, (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_5: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_5: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlq $40, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v2i8_5: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> store <2 x i8> %strided.vec, <2 x i8>* %S @@ -857,31 +652,12 @@ ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: 
shuffle_v16i8_to_v2i8_6: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_6: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlq $48, (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_6: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_6: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v2i8_6: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> store <2 x i8> %strided.vec, <2 x i8>* %S @@ -917,31 +693,12 @@ ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v2i8_7: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_7: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlq $56, (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_7: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_7: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlq $56, (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v2i8_7: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> store <2 x i8> %strided.vec, <2 x i8>* %S Index: llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll =================================================================== --- llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -125,49 +125,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = 
<1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i8_to_v8i8_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -186,46 +153,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15] -; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: 
shuffle_v32i8_to_v8i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -244,49 +181,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i8_to_v8i8_3: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -339,11 +243,11 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; 
AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1: @@ -360,9 +264,9 @@ ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,4,5,12,13] ; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi) +; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -414,9 +318,13 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] -; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2: @@ -432,9 +340,10 @@ ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [2,6,10,14,2,3,10,11] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -488,11 +397,11 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3: @@ -509,9 +418,9 @@ ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,11,15,2,3,10,11] ; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi) +; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -531,49 +440,16 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; 
AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i8_to_v4i8_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S @@ -592,46 +468,16 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; 
AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13] -; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i8_to_v4i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S @@ -650,49 +496,16 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i8_to_v4i8_3: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L 
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S @@ -711,41 +524,16 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i8_to_v4i8_4: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S @@ -764,49 +552,16 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, 
%xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i8_to_v4i8_5: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S @@ -825,46 +580,16 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11] -; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i8_to_v4i8_6: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <32 x i8>, 
<32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S @@ -883,49 +608,16 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i8_to_v4i8_7: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S Index: llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll =================================================================== --- llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -425,77 +425,23 @@ } define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-NEXT: vmovdqa 
{{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11] -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = 
<1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -503,68 +449,23 @@ } define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,5,9,13,17,21,25,29] -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; 
AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -572,77 +473,23 @@ } define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11] -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_3: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -650,68 +497,23 @@ } define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_4: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_4: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,6,10,14,18,22,26,30] -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_4: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -719,77 +521,23 @@ } define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: 
vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_5: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm0[0],xmm2[1],xmm0[2,3] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -797,68 +545,23 @@ } define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,11,15,19,23,27,31] -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_6: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa 
{{.*#+}} xmm4 = +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -866,77 +569,23 @@ } define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7: -; 
AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_7: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S Index: llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll =================================================================== --- llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -37,32 +37,12 @@ ; AVX-NEXT: vmovq %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -148,31 +128,12 @@ ; AVX-NEXT: vmovq %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: 
shuffle_v8i16_to_v4i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_to_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v8i16_to_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v8i16_to_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %L %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> store <4 x i16> %strided.vec, <4 x i16>* %S @@ -247,29 +208,11 @@ ; AVX-NEXT: vmovlps %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v4i32_to_v2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_to_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v4i32_to_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v4i32_to_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %L %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> store <2 x i32> %strided.vec, <2 x i32>* %S @@ -343,31 +286,12 @@ ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v4i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = 
shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S @@ -469,7 +393,8 @@ ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vmovd %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16: @@ -482,7 +407,8 @@ ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %L %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> @@ -577,31 +503,12 @@ ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i8_to_v2i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_to_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i8_to_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i8_to_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %L %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> store <2 x i8> %strided.vec, <2 x i8>* %S Index: llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll =================================================================== --- llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -408,17 +408,20 @@ ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] -; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] -; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112] +; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) ; 
AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> @@ -427,32 +430,21 @@ } define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX1-LABEL: trunc_v8i32_to_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v8i32_to_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: trunc_v8i32_to_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_v8i32_to_v8i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -467,8 +459,7 @@ ; AVX512BW-LABEL: trunc_v8i32_to_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -499,53 +490,59 @@ ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; 
AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %truncated.vec = trunc <8 x i32> %vec to <8 x i8> @@ -677,53 +674,59 @@ ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; 
AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %truncated = trunc <8 x i32> %vec to <8 x i8> @@ -737,58 +740,72 @@ ; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = 
xmm0[0],zero ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %truncated = trunc <4 x i64> %vec to <4 x i16> @@ -930,58 +947,72 @@ ; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; 
AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %truncated = trunc <4 x i64> %vec to <4 x i16> @@ -993,58 +1024,66 @@ ; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8_return_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8_return_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VBMIVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %truncated = trunc <4 x i64> %vec to <4 x i8> @@ -1096,9 +1135,13 @@ ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: @@ -1114,16 +1157,18 @@ ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX512VBMIVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13] +; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -1134,34 +1179,39 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-LABEL: trunc_v4i64_to_v4i16: 
; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vmovq %xmm0, (%rsi) ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1176,8 +1226,7 @@ ; AVX512BW-LABEL: trunc_v4i64_to_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1227,9 +1276,13 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vmovd %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: @@ -1245,16 +1298,21 @@ ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa 
(%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX512VBMIVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808] +; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovd %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> @@ -1263,36 +1321,21 @@ } define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { -; AVX1-LABEL: trunc_v4i64_to_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX-LABEL: trunc_v4i64_to_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1307,8 +1350,7 @@ ; AVX512BW-LABEL: trunc_v4i64_to_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll =================================================================== --- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -585,11 +585,20 @@ ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = 
[0,4,8,12,16,20,24,28] -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) -; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) ; AVX512BWVL-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8: @@ -612,10 +621,10 @@ ; ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224] +; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L @@ -884,6 +893,7 @@ ; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %truncated = trunc <8 x i64> %vec to <8 x i8> Index: llvm/test/CodeGen/X86/slow-pmulld.ll =================================================================== --- llvm/test/CodeGen/X86/slow-pmulld.ll +++ llvm/test/CodeGen/X86/slow-pmulld.ll @@ -20,74 +20,74 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) { ; CHECK32-LABEL: test_mul_v4i32_v4i8: ; CHECK32: # %bb.0: -; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0 +; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; CHECK32-NEXT: retl ; ; CHECK64-LABEL: test_mul_v4i32_v4i8: ; CHECK64: # %bb.0: -; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; CHECK64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v4i32_v4i8: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0 +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; SSE4-32-NEXT: retl ; ; SSE4-64-LABEL: test_mul_v4i32_v4i8: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; SSE4-64-NEXT: retq ; ; AVX2-32-LABEL: test_mul_v4i32_v4i8: ; AVX2-32: # %bb.0: -; 
AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 ; AVX2-32-NEXT: retl ; ; AVX2-64-LABEL: test_mul_v4i32_v4i8: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-64-NEXT: retq ; ; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8: ; AVX512DQ-32: # %bb.0: -; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 ; AVX512DQ-32-NEXT: retl ; ; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8: ; AVX512DQ-64: # %bb.0: -; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-64-NEXT: retq ; ; AVX512BW-32-LABEL: test_mul_v4i32_v4i8: ; AVX512BW-32: # %bb.0: -; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 ; AVX512BW-32-NEXT: retl ; ; AVX512BW-64-LABEL: test_mul_v4i32_v4i8: ; AVX512BW-64: # %bb.0: -; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-64-NEXT: retq ; ; KNL-32-LABEL: test_mul_v4i32_v4i8: ; KNL-32: # %bb.0: -; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; KNL-32-NEXT: retl ; ; KNL-64-LABEL: test_mul_v4i32_v4i8: ; KNL-64: # %bb.0: -; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; KNL-64-NEXT: retq @@ -99,34 +99,31 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) { ; SLM32-LABEL: test_mul_v8i32_v8i8: ; SLM32: # %bb.0: -; SLM32-NEXT: movdqa %xmm0, %xmm1 -; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm1 +; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLM32-NEXT: movdqa %xmm1, %xmm2 ; SLM32-NEXT: pmullw %xmm0, %xmm1 ; SLM32-NEXT: pmulhw %xmm0, %xmm2 ; SLM32-NEXT: movdqa %xmm1, %xmm0 -; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v8i32_v8i8: ; SLM64: # %bb.0: -; SLM64-NEXT: movdqa %xmm0, %xmm1 -; SLM64-NEXT: pand {{.*}}(%rip), %xmm1 +; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLM64-NEXT: movdqa %xmm1, %xmm2 ; SLM64-NEXT: pmullw %xmm0, %xmm1 ; SLM64-NEXT: pmulhw %xmm0, %xmm2 ; SLM64-NEXT: movdqa %xmm1, %xmm0 -; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v8i32_v8i8: ; SLOW32: # %bb.0: -; SLOW32-NEXT: movdqa %xmm0, %xmm1 -; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm1 +; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLOW32-NEXT: movdqa %xmm1, %xmm2 ; SLOW32-NEXT: pmulhw %xmm0, %xmm2 @@ -138,8 +135,7 @@ ; ; SLOW64-LABEL: test_mul_v8i32_v8i8: ; SLOW64: # %bb.0: -; SLOW64-NEXT: movdqa %xmm0, %xmm1 -; SLOW64-NEXT: pand {{.*}}(%rip), %xmm1 +; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLOW64-NEXT: movdqa %xmm1, %xmm2 ; SLOW64-NEXT: pmulhw %xmm0, %xmm2 @@ -151,10 +147,9 @@ ; ; SSE4-32-LABEL: test_mul_v8i32_v8i8: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0 -; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1 @@ -162,10 +157,9 @@ ; ; SSE4-64-LABEL: test_mul_v8i32_v8i8: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1 @@ -173,58 +167,50 @@ ; ; AVX2-32-LABEL: test_mul_v8i32_v8i8: ; AVX2-32: # %bb.0: -; 
AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX2-32-NEXT: retl ; ; AVX2-64-LABEL: test_mul_v8i32_v8i8: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-64-NEXT: retq ; ; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8: ; AVX512DQ-32: # %bb.0: -; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX512DQ-32-NEXT: retl ; ; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8: ; AVX512DQ-64: # %bb.0: -; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-64-NEXT: retq ; ; AVX512BW-32-LABEL: test_mul_v8i32_v8i8: ; AVX512BW-32: # %bb.0: -; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX512BW-32-NEXT: retl ; ; AVX512BW-64-LABEL: test_mul_v8i32_v8i8: ; AVX512BW-64: # %bb.0: -; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-64-NEXT: retq ; ; KNL-32-LABEL: test_mul_v8i32_v8i8: ; KNL-32: # %bb.0: -; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] ; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; KNL-32-NEXT: retl ; ; KNL-64-LABEL: test_mul_v8i32_v8i8: ; KNL-64: # %bb.0: -; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] ; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; KNL-64-NEXT: retq @@ -409,72 +395,46 @@ } define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) { -; SLM32-LABEL: test_mul_v4i32_v4i16: -; SLM32: # %bb.0: -; SLM32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SLM32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> -; SLM32-NEXT: movdqa %xmm0, %xmm2 -; SLM32-NEXT: pmullw %xmm1, %xmm0 -; SLM32-NEXT: pmulhuw %xmm1, %xmm2 -; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SLM32-NEXT: retl -; -; SLM64-LABEL: test_mul_v4i32_v4i16: -; SLM64: # %bb.0: -; SLM64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SLM64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> -; SLM64-NEXT: movdqa %xmm0, %xmm2 -; SLM64-NEXT: pmullw %xmm1, %xmm0 -; SLM64-NEXT: pmulhuw %xmm1, %xmm2 -; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SLM64-NEXT: retq -; -; SLOW32-LABEL: test_mul_v4i32_v4i16: -; SLOW32: # %bb.0: -; SLOW32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> -; SLOW32-NEXT: movdqa %xmm0, %xmm2 -; SLOW32-NEXT: pmulhuw %xmm1, %xmm2 -; SLOW32-NEXT: pmullw %xmm1, %xmm0 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SLOW32-NEXT: retl +; CHECK32-LABEL: test_mul_v4i32_v4i16: +; CHECK32: # %bb.0: +; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> +; CHECK32-NEXT: movdqa %xmm0, %xmm2 +; CHECK32-NEXT: pmulhuw %xmm1, %xmm2 +; CHECK32-NEXT: pmullw %xmm1, %xmm0 +; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK32-NEXT: retl ; -; SLOW64-LABEL: test_mul_v4i32_v4i16: -; SLOW64: # %bb.0: -; SLOW64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> -; SLOW64-NEXT: movdqa %xmm0, %xmm2 -; SLOW64-NEXT: pmulhuw %xmm1, %xmm2 -; SLOW64-NEXT: pmullw %xmm1, %xmm0 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SLOW64-NEXT: retq +; CHECK64-LABEL: test_mul_v4i32_v4i16: +; CHECK64: # %bb.0: +; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> +; CHECK64-NEXT: movdqa %xmm0, %xmm2 +; 
CHECK64-NEXT: pmulhuw %xmm1, %xmm2 +; CHECK64-NEXT: pmullw %xmm1, %xmm0 +; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v4i32_v4i16: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pxor %xmm1, %xmm1 -; SSE4-32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0 ; SSE4-32-NEXT: retl ; ; SSE4-64-LABEL: test_mul_v4i32_v4i16: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pxor %xmm1, %xmm1 -; SSE4-64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0 ; SSE4-64-NEXT: retq ; ; AVX-32-LABEL: test_mul_v4i32_v4i16: ; AVX-32: # %bb.0: -; AVX-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_mul_v4i32_v4i16: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq @@ -722,74 +682,74 @@ define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize { ; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize: ; CHECK32: # %bb.0: -; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0 +; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; CHECK32-NEXT: retl ; ; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize: ; CHECK64: # %bb.0: -; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; CHECK64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0 +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; SSE4-32-NEXT: retl ; ; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; SSE4-64-NEXT: retq ; ; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize: ; AVX2-32: # %bb.0: -; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 ; AVX2-32-NEXT: retl ; ; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 
+; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-64-NEXT: retq ; ; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize: ; AVX512DQ-32: # %bb.0: -; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 ; AVX512DQ-32-NEXT: retl ; ; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize: ; AVX512DQ-64: # %bb.0: -; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-64-NEXT: retq ; ; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize: ; AVX512BW-32: # %bb.0: -; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 ; AVX512BW-32-NEXT: retl ; ; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize: ; AVX512BW-64: # %bb.0: -; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-64-NEXT: retq ; ; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize: ; KNL-32: # %bb.0: -; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; KNL-32-NEXT: retl ; ; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize: ; KNL-64: # %bb.0: -; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; KNL-64-NEXT: retq @@ -801,32 +761,29 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize { ; SLM32-LABEL: test_mul_v8i32_v8i8_minsize: ; SLM32: # %bb.0: -; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm0 ; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM32-NEXT: pmaddwd %xmm2, %xmm0 ; SLM32-NEXT: pmaddwd %xmm2, %xmm1 ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v8i32_v8i8_minsize: ; SLM64: # %bb.0: -; SLM64-NEXT: pand {{.*}}(%rip), %xmm0 ; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM64-NEXT: pmaddwd %xmm2, %xmm0 ; SLM64-NEXT: pmaddwd %xmm2, %xmm1 ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize: ; SLOW32: # %bb.0: -; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm0 -; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SLOW32-NEXT: pmaddwd %xmm2, %xmm0 ; SLOW32-NEXT: pmaddwd %xmm2, %xmm1 @@ -834,10 +791,9 @@ ; ; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize: ; SLOW64: # %bb.0: -; SLOW64-NEXT: pand {{.*}}(%rip), %xmm0 -; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SLOW64-NEXT: pmaddwd %xmm2, %xmm0 ; SLOW64-NEXT: pmaddwd %xmm2, %xmm1 @@ -845,10 +801,9 @@ ; ; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0 -; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1 @@ -856,10 +811,9 @@ ; ; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1 @@ -867,58 +821,50 @@ ; ; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize: ; AVX2-32: # %bb.0: -; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX2-32-NEXT: retl ; ; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-64-NEXT: retq ; ; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize: ; AVX512DQ-32: # %bb.0: -; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX512DQ-32-NEXT: retl ; ; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize: ; AVX512DQ-64: # %bb.0: -; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-64-NEXT: retq ; ; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize: ; AVX512BW-32: # %bb.0: -; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 ; AVX512BW-32-NEXT: retl ; ; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize: ; AVX512BW-64: # %bb.0: -; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-64-NEXT: retq ; ; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize: ; KNL-32: # %bb.0: -; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] ; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; KNL-32-NEXT: retl ; ; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize: ; KNL-64: # %bb.0: -; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] ; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; KNL-64-NEXT: retq @@ -1087,44 +1033,38 @@ define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize { ; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize: ; CHECK32: # %bb.0: -; CHECK32-NEXT: pxor %xmm1, %xmm1 -; CHECK32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK32-NEXT: pmulld {{\.LCPI.*}}, %xmm0 ; CHECK32-NEXT: retl ; ; CHECK64-LABEL: test_mul_v4i32_v4i16_minsize: ; CHECK64: # %bb.0: -; CHECK64-NEXT: pxor %xmm1, %xmm1 -; CHECK64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK64-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v4i32_v4i16_minsize: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pxor %xmm1, %xmm1 -; SSE4-32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0 ; SSE4-32-NEXT: retl ; ; SSE4-64-LABEL: test_mul_v4i32_v4i16_minsize: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pxor %xmm1, %xmm1 -; SSE4-64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0 ; SSE4-64-NEXT: retq ; ; AVX-32-LABEL: test_mul_v4i32_v4i16_minsize: ; AVX-32: # %bb.0: -; AVX-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_mul_v4i32_v4i16_minsize: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq Index: llvm/test/CodeGen/X86/sse2-intrinsics-canonical.ll =================================================================== --- llvm/test/CodeGen/X86/sse2-intrinsics-canonical.ll +++ llvm/test/CodeGen/X86/sse2-intrinsics-canonical.ll @@ -93,40 +93,17 @@ define <8 x i8> @test_x86_sse2_paddus_b_64(<8 x i8> %a0, <8 x i8> %a1) { ; SSE-LABEL: test_x86_sse2_paddus_b_64: ; SSE: ## %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] -; SSE-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4 -; SSE-NEXT: pand %xmm2, %xmm1 ## encoding: [0x66,0x0f,0xdb,0xca] -; SSE-NEXT: packuswb %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x67,0xc9] -; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] -; SSE-NEXT: packuswb %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x67,0xc0] ; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1] -; SSE-NEXT: punpcklbw %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x60,0xc0] -; SSE-NEXT: ## xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_x86_sse2_paddus_b_64: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] -; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x71,0x00,0xca] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc2] ; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1] -; AVX2-NEXT: vpmovzxbw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x30,0xc0] -; AVX2-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; SKX-LABEL: test_x86_sse2_paddus_b_64: ; SKX: ## %bb.0: -; SKX-NEXT: vmovdqa LCPI4_0, %xmm2 ## EVEX TO VEX Compression xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] -; SKX-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4 -; SKX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x00,0xca] -; SKX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc2] ; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] -; SKX-NEXT: vpmovzxbw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xc0] -; SKX-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SKX-NEXT: retl ## encoding: [0xc3] %1 = add <8 x i8> %a0, %a1 %2 = icmp ugt <8 x i8> %a0, %1 @@ -137,45 +114,17 @@ define <4 x i16> @test_x86_sse2_paddus_w_64(<4 x i16> %a0, <4 x i16> %a1) { ; SSE-LABEL: test_x86_sse2_paddus_w_64: ; SSE: ## %bb.0: -; SSE-NEXT: 
pshuflw $232, %xmm1, %xmm1 ## encoding: [0xf2,0x0f,0x70,0xc9,0xe8] -; SSE-NEXT: ## xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw $232, %xmm1, %xmm1 ## encoding: [0xf3,0x0f,0x70,0xc9,0xe8] -; SSE-NEXT: ## xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd $232, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xe8] -; SSE-NEXT: ## xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw $232, %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x70,0xc0,0xe8] -; SSE-NEXT: ## xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw $232, %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x70,0xc0,0xe8] -; SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd $232, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0xe8] -; SSE-NEXT: ## xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1] -; SSE-NEXT: punpcklwd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x61,0xc0] -; SSE-NEXT: ## xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_x86_sse2_paddus_w_64: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] -; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x71,0x00,0xca] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc2] ; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1] -; AVX2-NEXT: vpmovzxwd %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x33,0xc0] -; AVX2-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; SKX-LABEL: test_x86_sse2_paddus_w_64: ; SKX: ## %bb.0: -; SKX-NEXT: vmovdqa LCPI5_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] -; SKX-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4 -; SKX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x00,0xca] -; SKX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc2] ; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] -; SKX-NEXT: vpmovzxwd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0] -; SKX-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SKX-NEXT: retl ## encoding: [0xc3] %1 = add <4 x i16> %a0, %a1 %2 = icmp ugt <4 x i16> %a0, %1 @@ -186,36 +135,17 @@ define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) { ; SSE-LABEL: test_x86_sse2_psubus_b_64: ; SSE: ## %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] -; SSE-NEXT: ## fixup A - offset: 4, value: LCPI6_0, kind: FK_Data_4 -; SSE-NEXT: movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9] -; SSE-NEXT: pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda] -; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] -; SSE-NEXT: pmaxsw %xmm3, %xmm0 ## encoding: [0x66,0x0f,0xee,0xc3] -; SSE-NEXT: psubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf9,0xc1] +; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1] ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_x86_sse2_psubus_b_64: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A] -; AVX2-NEXT: ## fixup A - 
offset: 5, value: LCPI6_0, kind: FK_Data_4 -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ## encoding: [0xc5,0xf1,0xdb,0xda] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc2] -; AVX2-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc3] -; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf9,0xc1] +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1] ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; SKX-LABEL: test_x86_sse2_psubus_b_64: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw LCPI6_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,255,255,255,255,255,255,255] -; SKX-NEXT: ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A] -; SKX-NEXT: ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4 -; SKX-NEXT: vpand %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xda] -; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc2] -; SKX-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc3] -; SKX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1] +; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] ; SKX-NEXT: retl ## encoding: [0xc3] %cmp = icmp ugt <8 x i8> %a0, %a1 %sel = select <8 x i1> %cmp, <8 x i8> %a0, <8 x i8> %a1 @@ -226,41 +156,17 @@ define <4 x i16> @test_x86_sse2_psubus_w_64(<4 x i16> %a0, <4 x i16> %a1) { ; SSE-LABEL: test_x86_sse2_psubus_w_64: ; SSE: ## %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] -; SSE-NEXT: ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4 -; SSE-NEXT: movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9] -; SSE-NEXT: pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda] -; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] -; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0] -; SSE-NEXT: pcmpgtd %xmm3, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd3] -; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] -; SSE-NEXT: pandn %xmm3, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd3] -; SSE-NEXT: por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0] -; SSE-NEXT: psubd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xfa,0xd1] -; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] +; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1] ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_x86_sse2_psubus_w_64: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] -; AVX2-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa] -; AVX2-NEXT: ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa] -; AVX2-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vpmaxud %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3f,0xc3] -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfa,0xc1] +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1] ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; SKX-LABEL: test_x86_sse2_psubus_w_64: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; SKX-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa] -; 
SKX-NEXT: ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa] -; SKX-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SKX-NEXT: vpmaxud %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3f,0xc3] -; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1] +; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] ; SKX-NEXT: retl ## encoding: [0xc3] %cmp = icmp ugt <4 x i16> %a0, %a1 %sel = select <4 x i1> %cmp, <4 x i16> %a0, <4 x i16> %a1 Index: llvm/test/CodeGen/X86/sse2-vector-shifts.ll =================================================================== --- llvm/test/CodeGen/X86/sse2-vector-shifts.ll +++ llvm/test/CodeGen/X86/sse2-vector-shifts.ll @@ -321,8 +321,9 @@ define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind { ; CHECK-LABEL: shl_zext_srl_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-NEXT: retq %srl = lshr <4 x i16> %x, %zext = zext <4 x i16> %srl to <4 x i32> @@ -334,6 +335,7 @@ ; CHECK-LABEL: sra_trunc_srl_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: psrad $19, %xmm0 +; CHECK-NEXT: packssdw %xmm0, %xmm0 ; CHECK-NEXT: retq %srl = lshr <4 x i32> %x, %trunc = trunc <4 x i32> %srl to <4 x i16> @@ -344,6 +346,7 @@ define <4 x i32> @shl_zext_shl_v4i32(<4 x i16> %x) nounwind { ; CHECK-LABEL: shl_zext_shl_v4i32: ; CHECK: # %bb.0: +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; CHECK-NEXT: pslld $19, %xmm0 ; CHECK-NEXT: retq %shl0 = shl <4 x i16> %x, Index: llvm/test/CodeGen/X86/ssub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -210,30 +210,13 @@ ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovwb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <8 x i8>, <8 x i8>* %px %y = load <8 x i8>, <8 x i8>* %py %z = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %x, <8 x i8> %y) @@ -250,30 +233,13 @@ ; SSE-NEXT: movd %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; 
AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512-NEXT: vpmovdb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <4 x i8>, <4 x i8>* %px %y = load <4 x i8>, <4 x i8>* %py %z = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %x, <4 x i8> %y) @@ -314,36 +280,15 @@ ; SSE41-NEXT: pextrw $0, %xmm0, (%rdx) ; SSE41-NEXT: retq ; -; AVX1-LABEL: v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: movzwl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: movzwl (%rsi), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: movzwl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: movzwl (%rsi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: movzwl (%rsi), %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovqb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v2i8: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movzwl (%rsi), %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <2 x i8>, <2 x i8>* %px %y = load <2 x i8>, <2 x i8>* %py %z = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %x, <2 x i8> %y) @@ -360,30 +305,13 @@ ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vpmovdw %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v4i16: +; AVX: # %bb.0: +; 
AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <4 x i16>, <4 x i16>* %px %y = load <4 x i16>, <4 x i16>* %py %z = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %x, <4 x i16> %y) @@ -400,30 +328,13 @@ ; SSE-NEXT: movd %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmovqw %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <2 x i16>, <2 x i16>* %px %y = load <2 x i16>, <2 x i16>* %py %z = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %x, <2 x i16> %y) @@ -658,248 +569,141 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; 
SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: psllq $32, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: psubq %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: psubd %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pcmpeqd 
%xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pandn %xmm0, %xmm3 -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: psrld $1, %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm2, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pandn %xmm3, %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: pandn %xmm1, %xmm3 +; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; 
SSE41-NEXT: retq ; ; AVX1-LABEL: v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 ; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1 +; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 ; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2 ; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1 +; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 ; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, 
%xmm0 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2 ; AVX512-NEXT: kxorw %k2, %k1, %k1 ; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} -; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2} +; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; AVX512-NEXT: retq %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z Index: llvm/test/CodeGen/X86/test-shrink-bug.ll =================================================================== --- llvm/test/CodeGen/X86/test-shrink-bug.ll +++ llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -69,11 +69,9 @@ ; CHECK-X64-NEXT: testl $263, %edi # imm = 0x107 ; CHECK-X64-NEXT: je .LBB1_3 ; CHECK-X64-NEXT: # %bb.1: -; CHECK-X64-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-X64-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 -; CHECK-X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] -; CHECK-X64-NEXT: pand %xmm0, %xmm1 -; CHECK-X64-NEXT: pextrw $4, %xmm1, %eax +; CHECK-X64-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; CHECK-X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax ; CHECK-X64-NEXT: testb $1, %al ; CHECK-X64-NEXT: jne .LBB1_3 ; CHECK-X64-NEXT: # %bb.2: # %no Index: llvm/test/CodeGen/X86/trunc-ext-ld-st.ll =================================================================== --- llvm/test/CodeGen/X86/trunc-ext-ld-st.ll +++ llvm/test/CodeGen/X86/trunc-ext-ld-st.ll @@ -8,23 +8,16 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddq {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: load_2_i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: paddq {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE41-NEXT: pextrw $0, %xmm0, (%rdi) ; SSE41-NEXT: retq %T = load <2 x i8>, <2 x i8>* %A @@ -35,25 +28,12 @@ ; Read 32-bits define void @load_2_i16(<2 x i16>* %A) { -; SSE2-LABEL: load_2_i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: paddq {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSE41-LABEL: load_2_i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: paddq {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; 
SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE41-NEXT: movd %xmm0, (%rdi) -; SSE41-NEXT: retq +; CHECK-LABEL: load_2_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movd %xmm0, (%rdi) +; CHECK-NEXT: retq %T = load <2 x i16>, <2 x i16>* %A %G = add <2 x i16> %T, store <2 x i16> %G, <2 x i16>* %A @@ -61,22 +41,12 @@ } define void @load_2_i32(<2 x i32>* %A) { -; SSE2-LABEL: load_2_i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSE41-LABEL: load_2_i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: retq +; CHECK-LABEL: load_2_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movq %xmm0, (%rdi) +; CHECK-NEXT: retq %T = load <2 x i32>, <2 x i32>* %A %G = add <2 x i32> %T, store <2 x i32> %G, <2 x i32>* %A @@ -84,25 +54,12 @@ } define void @load_4_i8(<4 x i8>* %A) { -; SSE2-LABEL: load_4_i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSE41-LABEL: load_4_i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movd %xmm0, (%rdi) -; SSE41-NEXT: retq +; CHECK-LABEL: load_4_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: paddb {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movd %xmm0, (%rdi) +; CHECK-NEXT: retq %T = load <4 x i8>, <4 x i8>* %A %G = add <4 x i8> %T, store <4 x i8> %G, <4 x i8>* %A @@ -110,24 +67,12 @@ } define void @load_4_i16(<4 x i16>* %A) { -; SSE2-LABEL: load_4_i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSE41-LABEL: load_4_i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: retq +; CHECK-LABEL: load_4_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movq %xmm0, (%rdi) +; CHECK-NEXT: retq %T = load <4 x i16>, <4 x i16>* %A %G = add <4 x i16> %T, store <4 x i16> %G, <4 x i16>* %A Index: 
llvm/test/CodeGen/X86/trunc-subvector.ll =================================================================== --- llvm/test/CodeGen/X86/trunc-subvector.ll +++ llvm/test/CodeGen/X86/trunc-subvector.ll @@ -40,25 +40,14 @@ define <2 x i32> @test3(<8 x i32> %v) { ; SSE2-LABEL: test3: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; AVX2-LABEL: test3: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test3: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test3: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -68,23 +57,13 @@ define <2 x i32> @test4(<8 x i32> %v) { ; SSE2-LABEL: test4: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; -; AVX2-LABEL: test4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test4: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -94,32 +73,31 @@ define <2 x i32> @test5(<8 x i32> %v) { ; SSE2-LABEL: test5: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; AVX2-LABEL: test5: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test5: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = 
xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> @@ -165,23 +143,13 @@ ; SSE2-LABEL: test8: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; -; AVX2-LABEL: test8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test8: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -191,22 +159,13 @@ define <2 x i32> @test9(<8 x i32> %v) { ; SSE2-LABEL: test9: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; -; AVX2-LABEL: test9: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test9: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test9: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> %s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> %t = trunc <2 x i64> %s to <2 x i32> @@ -216,28 +175,31 @@ define <2 x i32> @test10(<8 x i32> %v) { ; SSE2-LABEL: test10: ; SSE2: # %bb.0: -; SSE2-NEXT: xorpd %xmm2, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; AVX2-LABEL: test10: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test10: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovzxdq 
{{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> Index: llvm/test/CodeGen/X86/uadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -210,30 +210,13 @@ ; SSE-NEXT: movq %xmm1, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovwb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <8 x i8>, <8 x i8>* %px %y = load <8 x i8>, <8 x i8>* %py %z = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y) @@ -250,30 +233,13 @@ ; SSE-NEXT: movd %xmm1, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512-NEXT: vpmovdb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <4 x i8>, <4 x i8>* %px %y = load <4 x i8>, <4 x i8>* %py %z = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %x, <4 x i8> %y) @@ -314,36 +280,15 @@ ; SSE41-NEXT: pextrw $0, %xmm1, (%rdx) ; SSE41-NEXT: 
retq ; -; AVX1-LABEL: v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: movzwl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: movzwl (%rsi), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: movzwl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: movzwl (%rsi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: movzwl (%rsi), %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovqb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v2i8: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movzwl (%rsi), %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <2 x i8>, <2 x i8>* %px %y = load <2 x i8>, <2 x i8>* %py %z = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %x, <2 x i8> %y) @@ -360,30 +305,13 @@ ; SSE-NEXT: movq %xmm1, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vpmovdw %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <4 x i16>, <4 x i16>* %px %y = load <4 x i16>, <4 x i16>* %py %z = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %x, <4 x i16> %y) @@ -400,30 +328,13 @@ ; SSE-NEXT: movd %xmm1, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmovqw %xmm0, (%rdx) -; AVX512-NEXT: 
retq +; AVX-LABEL: v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <2 x i16>, <2 x i16>* %px %y = load <2 x i16>, <2 x i16>* %py %z = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y) @@ -631,97 +542,54 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: psrlq $32, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: psllq $32, %xmm1 -; SSSE3-NEXT: paddq %xmm0, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: paddd %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: psrlq $32, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: paddd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm2, %xmm0, 
%xmm3 -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 ; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 -; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %z = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z Index: llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll =================================================================== --- llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -68,17 +68,13 @@ ; CHECK-SSE2-LABEL: out_v2i8: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 ; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: out_v2i8: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %mx = and <2 x i8> %x, %mask %notmask = xor <2 x i8> %mask, @@ -174,17 +170,13 @@ ; CHECK-SSE2-LABEL: out_v4i8: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 ; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: out_v4i8: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %mx = and <4 x i8> %x, %mask %notmask = xor <4 x i8> %mask, @@ -247,17 +239,13 @@ ; CHECK-SSE2-LABEL: out_v4i8_undef: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 ; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: out_v4i8_undef: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %mx = and <4 x i8> %x, %mask %notmask = xor <4 x i8> %mask, @@ -300,17 +288,13 @@ ; CHECK-SSE2-LABEL: out_v2i16: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 ; CHECK-SSE2-NEXT: orps 
%xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: out_v2i16: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %mx = and <2 x i16> %x, %mask %notmask = xor <2 x i16> %mask, @@ -483,17 +467,13 @@ ; CHECK-SSE2-LABEL: out_v8i8: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 ; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: out_v8i8: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %mx = and <8 x i8> %x, %mask %notmask = xor <8 x i8> %mask, @@ -568,17 +548,13 @@ ; CHECK-SSE2-LABEL: out_v4i16: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 ; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: out_v4i16: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -641,17 +617,13 @@ ; CHECK-SSE2-LABEL: out_v4i16_undef: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 ; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: out_v4i16_undef: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -692,17 +664,13 @@ ; CHECK-SSE2-LABEL: out_v2i32: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 ; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: out_v2i32: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %mx = and <2 x i32> %x, %mask %notmask = xor <2 x i32> %mask, Index: llvm/test/CodeGen/X86/usub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/usub_sat_vec.ll +++ llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -210,30 +210,13 @@ ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = 
mem[0],zero -; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovwb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <8 x i8>, <8 x i8>* %px %y = load <8 x i8>, <8 x i8>* %py %z = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %x, <8 x i8> %y) @@ -250,30 +233,13 @@ ; SSE-NEXT: movd %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512-NEXT: vpmovdb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <4 x i8>, <4 x i8>* %px %y = load <4 x i8>, <4 x i8>* %py %z = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %x, <4 x i8> %y) @@ -314,36 +280,15 @@ ; SSE41-NEXT: pextrw $0, %xmm0, (%rdx) ; SSE41-NEXT: retq ; -; AVX1-LABEL: v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: movzwl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: movzwl (%rsi), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: movzwl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: movzwl (%rsi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: movzwl (%rsi), %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovqb %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v2i8: +; AVX: # %bb.0: +; AVX-NEXT: 
movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movzwl (%rsi), %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <2 x i8>, <2 x i8>* %px %y = load <2 x i8>, <2 x i8>* %py %z = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %x, <2 x i8> %y) @@ -360,30 +305,13 @@ ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vpmovdw %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <4 x i16>, <4 x i16>* %px %y = load <4 x i16>, <4 x i16>* %py %z = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %x, <4 x i16> %y) @@ -400,30 +328,13 @@ ; SSE-NEXT: movd %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdx) -; AVX2-NEXT: retq -; -; AVX512-LABEL: v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmovqw %xmm0, (%rdx) -; AVX512-NEXT: retq +; AVX-LABEL: v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdx) +; AVX-NEXT: retq %x = load <2 x i16>, <2 x i16>* %px %y = load <2 x i16>, <2 x i16>* %py %z = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %x, <2 x i16> %y) @@ -631,99 +542,37 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: psllq $32, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm3 -; 
SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: psllq $32, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: psllq $32, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: psubq %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: psrlq $32, %xmm0 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: psubd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z } Index: llvm/test/CodeGen/X86/vec_cast2.ll =================================================================== --- llvm/test/CodeGen/X86/vec_cast2.ll +++ llvm/test/CodeGen/X86/vec_cast2.ll @@ -5,13 +5,10 @@ define <8 x float> @cvt_v8i8_v8f32(<8 x i8> %src) { ; CHECK-LABEL: 
cvt_v8i8_v8f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vpslld $24, %xmm0, %xmm0 -; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 -; CHECK-NEXT: vpslld $24, %xmm1, %xmm1 -; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl ; @@ -52,8 +49,7 @@ define <4 x float> @cvt_v4i8_v4f32(<4 x i8> %src) { ; CHECK-LABEL: cvt_v4i8_v4f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpslld $24, %xmm0, %xmm0 -; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -69,8 +65,7 @@ define <4 x float> @cvt_v4i16_v4f32(<4 x i16> %src) { ; CHECK-LABEL: cvt_v4i16_v4f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpslld $16, %xmm0, %xmm0 -; CHECK-NEXT: vpsrad $16, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -86,11 +81,10 @@ define <8 x float> @cvt_v8u8_v8f32(<8 x i8> %src) { ; CHECK-LABEL: cvt_v8u8_v8f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpand LCPI4_0, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl ; @@ -131,7 +125,7 @@ define <4 x float> @cvt_v4u8_v4f32(<4 x i8> %src) { ; CHECK-LABEL: cvt_v4u8_v4f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vandps LCPI6_0, %xmm0, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -147,8 +141,7 @@ define <4 x float> @cvt_v4u16_v4f32(<4 x i16> %src) { ; CHECK-LABEL: cvt_v4u16_v4f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -167,6 +160,7 @@ ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl ; @@ -206,6 +200,7 @@ ; CHECK-LABEL: cvt_v4f32_v4i8: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4i8: @@ -221,6 +216,7 @@ ; CHECK-LABEL: cvt_v4f32_v4i16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retl ; ; 
CHECK-WIDE-LABEL: cvt_v4f32_v4i16: @@ -237,7 +233,8 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl ; @@ -277,6 +274,7 @@ ; CHECK-LABEL: cvt_v4f32_v4u8: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4u8: @@ -292,6 +290,7 @@ ; CHECK-LABEL: cvt_v4f32_v4u16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4u16: Index: llvm/test/CodeGen/X86/vec_cast3.ll =================================================================== --- llvm/test/CodeGen/X86/vec_cast3.ll +++ llvm/test/CodeGen/X86/vec_cast3.ll @@ -5,9 +5,7 @@ define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) { ; CHECK-LABEL: cvt_v2i8_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpsllq $56, %xmm0, %xmm0 -; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -23,9 +21,7 @@ define <2 x float> @cvt_v2i16_v2f32(<2 x i16> %src) { ; CHECK-LABEL: cvt_v2i16_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpsllq $48, %xmm0, %xmm0 -; CHECK-NEXT: vpsrad $16, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -41,7 +37,6 @@ define <2 x float> @cvt_v2i32_v2f32(<2 x i32> %src) { ; CHECK-LABEL: cvt_v2i32_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -56,7 +51,7 @@ define <2 x float> @cvt_v2u8_v2f32(<2 x i8> %src) { ; CHECK-LABEL: cvt_v2u8_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -72,7 +67,7 @@ define <2 x float> @cvt_v2u16_v2f32(<2 x i16> %src) { ; CHECK-LABEL: cvt_v2u16_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,xmm0[10,11],zero,zero +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl ; @@ -88,10 +83,9 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) { ; CHECK-LABEL: cvt_v2u32_v2f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ; CHECK-NEXT: retl @@ -112,7 +106,7 @@ ; CHECK-LABEL: cvt_v2f32_v2i8: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 +; CHECK-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2i8: @@ -128,7 +122,7 @@ ; CHECK-LABEL: cvt_v2f32_v2i16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 +; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2i16: @@ -144,7 +138,6 @@ ; CHECK-LABEL: cvt_v2f32_v2i32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2i32: @@ -159,7 +152,7 @@ ; CHECK-LABEL: cvt_v2f32_v2u8: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2u8: @@ -175,7 +168,7 @@ ; CHECK-LABEL: cvt_v2f32_v2u16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2u16: @@ -190,39 +183,13 @@ define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) { ; CHECK-LABEL: cvt_v2f32_v2u32: ; CHECK: ## %bb.0: -; CHECK-NEXT: subl $36, %esp -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vucomiss %xmm1, %xmm2 -; CHECK-NEXT: jb LBB11_2 -; CHECK-NEXT: ## %bb.1: -; CHECK-NEXT: vsubss %xmm1, %xmm2, %xmm2 -; CHECK-NEXT: LBB11_2: -; CHECK-NEXT: vmovss %xmm2, (%esp) -; CHECK-NEXT: flds (%esp) -; CHECK-NEXT: fisttpll (%esp) -; CHECK-NEXT: setae %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: shll $31, %eax -; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: vucomiss %xmm1, %xmm0 -; CHECK-NEXT: jb LBB11_4 -; CHECK-NEXT: ## %bb.3: -; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: LBB11_4: -; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) -; CHECK-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-NEXT: fisttpll {{[0-9]+}}(%esp) -; CHECK-NEXT: setae %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: shll $31, %ecx -; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; CHECK-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; CHECK-NEXT: addl $36, %esp +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; CHECK-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; CHECK-NEXT: vcvttps2dq %xmm1, %xmm1 +; CHECK-NEXT: vxorps LCPI11_1, %xmm1, %xmm1 +; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2u32: Index: llvm/test/CodeGen/X86/vec_ctbits.ll =================================================================== --- llvm/test/CodeGen/X86/vec_ctbits.ll +++ llvm/test/CodeGen/X86/vec_ctbits.ll @@ -110,9 +110,8 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind { ; CHECK-LABEL: promtz: ; CHECK: # %bb.0: -; CHECK-NEXT: por {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-NEXT: paddq %xmm0, %xmm1 +; CHECK-NEXT: paddd %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 @@ -129,7 +128,12 @@ ; CHECK-NEXT: paddb %xmm0, %xmm1 ; 
CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; CHECK-NEXT: psadbw %xmm0, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: psadbw %xmm0, %xmm1 +; CHECK-NEXT: packuswb %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) @@ -139,44 +143,44 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind { ; CHECK-LABEL: promlz: ; CHECK: # %bb.0: -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrlq $1, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlq $2, %xmm0 -; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrlq $4, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlq $8, %xmm0 -; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrlq $16, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlq $32, %xmm0 -; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $1, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrld $2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $4, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrld $8, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 ; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: psrlw $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: psubb %xmm0, %xmm2 ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; CHECK-NEXT: movdqa %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm0, %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: psrlw $2, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: paddb %xmm3, %xmm2 +; CHECK-NEXT: paddb %xmm1, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: psrlw $4, %xmm0 ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-NEXT: psadbw %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: psadbw %xmm1, %xmm0 -; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0 +; CHECK-NEXT: packuswb %xmm2, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) ret <2 x i32> %c @@ -186,23 +190,27 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind { ; CHECK-LABEL: prompop: ; CHECK: # %bb.0: -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: psubb %xmm1, %xmm0 ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; CHECK-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 ; CHECK-NEXT: psrlw $2, %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 -; CHECK-NEXT: paddb 
%xmm3, %xmm0 +; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 ; CHECK-NEXT: paddb %xmm0, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psadbw %xmm2, %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; CHECK-NEXT: psadbw %xmm0, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: psadbw %xmm0, %xmm1 +; CHECK-NEXT: packuswb %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) Index: llvm/test/CodeGen/X86/vec_extract-mmx.ll =================================================================== --- llvm/test/CodeGen/X86/vec_extract-mmx.ll +++ llvm/test/CodeGen/X86/vec_extract-mmx.ll @@ -115,12 +115,10 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %ebp ; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $32, %esp ; X32-NEXT: movq %mm0, (%esp) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X32-NEXT: movd %xmm0, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: retl @@ -128,9 +126,7 @@ ; X64-LABEL: test4: ; X64: # %bb.0: ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: retq %tmp0 = bitcast x86_mmx %a to <2 x i32> %tmp1 = extractelement <2 x i32> %tmp0, i32 1 Index: llvm/test/CodeGen/X86/vec_fp_to_int.ll =================================================================== --- llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -93,13 +93,11 @@ ; SSE-LABEL: fptosi_2f64_to_2i32: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_2i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i32> ret <2 x i32> %cvt @@ -336,53 +334,45 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: 
retq ; -; VEX-LABEL: fptoui_2f64_to_4i32: -; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttsd2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rdx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttsd2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rcx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; VEX-NEXT: retq +; AVX1-LABEL: fptoui_2f64_to_4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpackssdw %xmm0, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_4i32: ; AVX512F: # %bb.0: @@ -417,51 +407,45 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: subsd %xmm1, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm1, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f64_to_2i32: -; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; VEX-NEXT: vsubsd 
%xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttsd2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rdx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttsd2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rcx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; VEX-NEXT: retq +; AVX1-LABEL: fptoui_2f64_to_2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_2i32: ; AVX512F: # %bb.0: @@ -496,29 +480,13 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_4f64_to_2i32: @@ -764,46 +732,20 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movsd 
{{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm1, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm1, %xmm4 -; SSE-NEXT: subsd %xmm2, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd %xmm0, %xmm4 -; SSE-NEXT: subsd %xmm2, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_4f64_to_4i32: @@ -874,13 +816,11 @@ ; SSE-LABEL: fptosi_2f32_to_2i32: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f32_to_2i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i32> ret <2 x i32> %cvt @@ -1259,77 +1199,66 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: subss %xmm2, %xmm1 -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subss %xmm2, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rcx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: cmpltps %xmm2, %xmm1 +; SSE-NEXT: cvttps2dq %xmm0, %xmm3 +; SSE-NEXT: subps 
%xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 +; SSE-NEXT: andps %xmm1, %xmm3 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f32_to_2i32: -; VEX: # %bb.0: -; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttss2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rdx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttss2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: retq +; AVX1-LABEL: fptoui_2f32_to_2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f32_to_2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f32_to_2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f32_to_2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_2f32_to_2i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VLDQ-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i32> ret <2 x i32> %cvt @@ -2225,7 +2154,8 @@ ; SSE-LABEL: fptosi_2f16_to_4i32: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rax -; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: callq __gnu_f2h_ieee ; SSE-NEXT: movzwl %ax, %edi ; SSE-NEXT: callq __gnu_h2f_ieee @@ -2235,20 +2165,20 @@ ; SSE-NEXT: callq __gnu_f2h_ieee ; SSE-NEXT: movzwl %ax, 
%edi ; SSE-NEXT: callq __gnu_h2f_ieee -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: cvttss2si (%rsp), %rax # 4-byte Folded Reload -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f16_to_4i32: ; VEX: # %bb.0: ; VEX-NEXT: pushq %rax -; VEX-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; VEX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; VEX-NEXT: vmovaps %xmm1, %xmm0 ; VEX-NEXT: callq __gnu_f2h_ieee ; VEX-NEXT: movzwl %ax, %edi ; VEX-NEXT: callq __gnu_h2f_ieee @@ -2258,27 +2188,27 @@ ; VEX-NEXT: callq __gnu_f2h_ieee ; VEX-NEXT: movzwl %ax, %edi ; VEX-NEXT: callq __gnu_h2f_ieee -; VEX-NEXT: vcvttss2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vcvttss2si (%rsp), %rax # 4-byte Folded Reload -; VEX-NEXT: vmovq %rax, %xmm1 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; VEX-NEXT: vcvttss2si %xmm0, %eax +; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vmovd %eax, %xmm1 +; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; VEX-NEXT: popq %rax ; VEX-NEXT: retq ; ; AVX512-LABEL: fptosi_2f16_to_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvttss2si %xmm1, %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vcvttss2si %xmm0, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvttss2si %xmm0, %eax +; AVX512-NEXT: vcvttss2si %xmm1, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512-NEXT: retq %cvt = fptosi <2 x half> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> @@ -2295,32 +2225,31 @@ ; SSE-NEXT: orl $3072, %eax # imm = 0xC00 ; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) ; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: orl $3072, %eax # imm = 0xC00 ; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) ; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} 
xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f80_to_4i32: ; AVX: # %bb.0: ; AVX-NEXT: fldt {{[0-9]+}}(%rsp) ; AVX-NEXT: fldt {{[0-9]+}}(%rsp) -; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp) -; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: retq %cvt = fptosi <2 x x86_fp80> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> @@ -2330,51 +2259,44 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { ; SSE-LABEL: fptosi_2f128_to_4i32: ; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movq %rsi, %r14 -; SSE-NEXT: movq %rdi, %rbx -; SSE-NEXT: movq %rdx, %rdi -; SSE-NEXT: movq %rcx, %rsi -; SSE-NEXT: callq __fixtfdi -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movq %rcx, %r14 +; SSE-NEXT: movq %rdx, %rbx +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movl %eax, %ebp ; SSE-NEXT: movq %rbx, %rdi ; SSE-NEXT: movq %r14, %rsi -; SSE-NEXT: callq __fixtfdi -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] -; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ebp, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f128_to_4i32: ; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp ; AVX-NEXT: pushq %r14 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: movq %rsi, %r14 -; AVX-NEXT: movq %rdi, %rbx -; AVX-NEXT: movq %rdx, %rdi -; AVX-NEXT: movq %rcx, %rsi -; AVX-NEXT: callq __fixtfdi -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: movq %rcx, %r14 +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: movl %eax, %ebp ; AVX-NEXT: movq %rbx, %rdi ; AVX-NEXT: movq %r14, %rsi -; AVX-NEXT: callq __fixtfdi -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vmovd %ebp, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %rbp ; AVX-NEXT: retq %cvt = fptosi <2 x fp128> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> @@ -2385,41 +2307,16 @@ ; SSE-LABEL: fptosi_2f32_to_2i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; 
SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f32_to_2i8: -; VEX: # %bb.0: -; VEX-NEXT: vcvttps2dq %xmm0, %xmm0 -; VEX-NEXT: vpmovsxdq %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f32_to_2i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f32_to_2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f32_to_2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f32_to_2i8: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptosi_2f32_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i8> ret <2 x i8> %cvt } @@ -2428,41 +2325,14 @@ ; SSE-LABEL: fptosi_2f32_to_2i16: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f32_to_2i16: -; VEX: # %bb.0: -; VEX-NEXT: vcvttps2dq %xmm0, %xmm0 -; VEX-NEXT: vpmovsxdq %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f32_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f32_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f32_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f32_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptosi_2f32_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2471,40 +2341,16 @@ ; SSE-LABEL: fptoui_2f32_to_2i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f32_to_2i8: -; VEX: # %bb.0: -; VEX-NEXT: vcvttps2dq %xmm0, %xmm0 -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f32_to_2i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512F-NEXT: retq -; -; 
AVX512VL-LABEL: fptoui_2f32_to_2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f32_to_2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f32_to_2i8: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptoui_2f32_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i8> ret <2 x i8> %cvt } @@ -2513,40 +2359,14 @@ ; SSE-LABEL: fptoui_2f32_to_2i16: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f32_to_2i16: -; VEX: # %bb.0: -; VEX-NEXT: vcvttps2dq %xmm0, %xmm0 -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f32_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f32_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f32_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f32_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptoui_2f32_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2555,41 +2375,16 @@ ; SSE-LABEL: fptosi_2f64_to_2i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f64_to_2i8: -; VEX: # %bb.0: -; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; VEX-NEXT: vpmovsxdq %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f64_to_2i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f64_to_2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f64_to_2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f64_to_2i8: -; AVX512VLDQ: # %bb.0: 
-; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptosi_2f64_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i8> ret <2 x i8> %cvt } @@ -2598,41 +2393,14 @@ ; SSE-LABEL: fptosi_2f64_to_2i16: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f64_to_2i16: -; VEX: # %bb.0: -; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; VEX-NEXT: vpmovsxdq %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f64_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f64_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f64_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f64_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptosi_2f64_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2641,40 +2409,16 @@ ; SSE-LABEL: fptoui_2f64_to_2i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: xorpd %xmm1, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f64_to_2i8: -; VEX: # %bb.0: -; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f64_to_2i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f64_to_2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f64_to_2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f64_to_2i8: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptoui_2f64_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i8> ret <2 x i8> %cvt } @@ -2683,40 +2427,14 @@ ; SSE-LABEL: fptoui_2f64_to_2i16: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: xorpd %xmm1, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: 
pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f64_to_2i16: -; VEX: # %bb.0: -; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f64_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f64_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f64_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f64_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptoui_2f64_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2895,10 +2613,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2907,10 +2625,10 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq Index: llvm/test/CodeGen/X86/vec_insert-5.ll =================================================================== --- llvm/test/CodeGen/X86/vec_insert-5.ll +++ llvm/test/CodeGen/X86/vec_insert-5.ll @@ -19,8 +19,7 @@ ; X64: # %bb.0: ; X64-NEXT: shll $12, %edi ; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq %tmp12 = shl i32 %a, 12 Index: llvm/test/CodeGen/X86/vec_insert-7.ll =================================================================== --- llvm/test/CodeGen/X86/vec_insert-7.ll +++ llvm/test/CodeGen/X86/vec_insert-7.ll @@ -8,11 +8,15 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind { ; X32-LABEL: mmx_movzl: ; X32: ## %bb.0: -; X32-NEXT: subl $12, %esp +; X32-NEXT: subl $28, %esp +; X32-NEXT: movq %mm0, (%esp) +; X32-NEXT: movdqa (%esp), %xmm0 ; X32-NEXT: movl $32, %eax -; X32-NEXT: movd %eax, %xmm0 -; X32-NEXT: movdq2q %xmm0, %mm0 -; X32-NEXT: addl $12, %esp +; X32-NEXT: pinsrd $0, %eax, %xmm0 +; X32-NEXT: pxor %xmm1, %xmm1 +; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; X32-NEXT: movdq2q %xmm1, %mm0 +; X32-NEXT: addl $28, %esp ; X32-NEXT: retl ; ; X64-LABEL: mmx_movzl: 
Index: llvm/test/CodeGen/X86/vec_insert-mmx.ll =================================================================== --- llvm/test/CodeGen/X86/vec_insert-mmx.ll +++ llvm/test/CodeGen/X86/vec_insert-mmx.ll @@ -61,13 +61,7 @@ ; X32-NEXT: movl L_g0$non_lazy_ptr, %eax ; X32-NEXT: movl L_g1$non_lazy_ptr, %ecx ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-NEXT: movzwl (%eax), %eax -; X32-NEXT: movd %eax, %xmm1 -; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-NEXT: pinsrw $0, (%eax), %xmm0 ; X32-NEXT: movq %xmm0, (%ecx) ; X32-NEXT: retl ; @@ -75,10 +69,8 @@ ; X64: ## %bb.0: ; X64-NEXT: movq _g0@{{.*}}(%rip), %rax ; X64-NEXT: movq _g1@{{.*}}(%rip), %rcx -; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-NEXT: movzwl (%rax), %eax -; X64-NEXT: pinsrd $0, %eax, %xmm0 -; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: pinsrw $0, (%rax), %xmm0 ; X64-NEXT: movq %xmm0, (%rcx) ; X64-NEXT: retq load i16, i16* @g0 Index: llvm/test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -3164,15 +3164,15 @@ ; ; SSE41-LABEL: sitofp_load_2i16_to_2f64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 ; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: sitofp_load_2i16_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i16>, <2 x i16> *%a @@ -3193,15 +3193,17 @@ ; ; SSE41-LABEL: sitofp_load_2i8_to_2f64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 ; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: sitofp_load_2i8_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i8>, <2 x i8> *%a @@ -3732,15 +3734,17 @@ ; ; SSE41-LABEL: uitofp_load_2i8_to_2f64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: uitofp_load_2i8_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; 
AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <2 x i8>, <2 x i8> *%a @@ -5577,14 +5581,12 @@ ; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movq 24(%rdi), %rax -; SSE41-NEXT: movdqu 8(%rdi), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 +; SSE41-NEXT: pmovsxwd 16(%rdi), %xmm0 +; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1 ; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 ; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, (%rax) -; SSE41-NEXT: movaps %xmm1, 16(%rax) +; SSE41-NEXT: movaps %xmm0, 16(%rax) +; SSE41-NEXT: movaps %xmm1, (%rax) ; SSE41-NEXT: retq ; ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32: Index: llvm/test/CodeGen/X86/vec_saddo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_saddo.ll +++ llvm/test/CodeGen/X86/vec_saddo.ll @@ -49,134 +49,73 @@ } define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { -; SSE2-LABEL: saddo_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: paddq %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: movq %xmm1, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: saddo_v2i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: psllq $32, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: paddq %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: movq %xmm1, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: saddo_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, 
%xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: paddq %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE41-NEXT: movq %xmm1, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: saddo_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: saddo_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: saddo_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: 
vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: saddo_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512-NEXT: vpmovqd %xmm0, (%rdi) -; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 @@ -1203,6 +1142,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: saddo_v2i64: @@ -1249,6 +1189,7 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: pandn %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: saddo_v2i64: @@ -1284,13 +1225,14 @@ ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm0 -; SSE41-NEXT: pandn %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm5, %xmm2 +; SSE41-NEXT: pandn %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: retq ; ; AVX1-LABEL: saddo_v2i64: @@ -1307,6 +1249,7 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -1324,6 +1267,7 @@ ; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 ; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: retq ; @@ -1333,13 +1277,13 @@ ; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 ; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 ; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k2 ; AVX512-NEXT: kxorw %k2, %k1, %k1 ; AVX512-NEXT: 
kandnw %k1, %k0, %k1 -; AVX512-NEXT: vmovdqa %xmm0, (%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 @@ -1626,44 +1570,44 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbp ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: testq %r9, %r9 +; SSE2-NEXT: setns %al +; SSE2-NEXT: testq %rsi, %rsi +; SSE2-NEXT: setns %bl +; SSE2-NEXT: cmpb %al, %bl +; SSE2-NEXT: sete %bpl +; SSE2-NEXT: addq %r8, %rdi +; SSE2-NEXT: adcq %r9, %rsi +; SSE2-NEXT: setns %al +; SSE2-NEXT: cmpb %al, %bl +; SSE2-NEXT: setne %al +; SSE2-NEXT: andb %bpl, %al ; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: adcq %r11, %rax +; SSE2-NEXT: movq %rcx, %rbp +; SSE2-NEXT: adcq %r10, %rbp ; SSE2-NEXT: setns %bl ; SSE2-NEXT: testq %rcx, %rcx ; SSE2-NEXT: setns %cl ; SSE2-NEXT: cmpb %bl, %cl -; SSE2-NEXT: setne %bpl -; SSE2-NEXT: testq %r11, %r11 +; SSE2-NEXT: setne %r8b +; SSE2-NEXT: testq %r10, %r10 ; SSE2-NEXT: setns %bl ; SSE2-NEXT: cmpb %bl, %cl ; SSE2-NEXT: sete %cl -; SSE2-NEXT: andb %bpl, %cl -; SSE2-NEXT: movzbl %cl, %ebp -; SSE2-NEXT: testq %r9, %r9 -; SSE2-NEXT: setns %bl -; SSE2-NEXT: testq %rsi, %rsi -; SSE2-NEXT: setns %cl -; SSE2-NEXT: cmpb %bl, %cl -; SSE2-NEXT: sete %r11b -; SSE2-NEXT: addq %r8, %rdi -; SSE2-NEXT: adcq %r9, %rsi -; SSE2-NEXT: setns %bl -; SSE2-NEXT: cmpb %bl, %cl -; SSE2-NEXT: setne %cl -; SSE2-NEXT: andb %r11b, %cl +; SSE2-NEXT: andb %r8b, %cl ; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: pinsrw $4, %ebp, %xmm0 -; SSE2-NEXT: movq %rdx, 16(%r10) -; SSE2-NEXT: movq %rdi, (%r10) -; SSE2-NEXT: movq %rax, 24(%r10) -; SSE2-NEXT: movq %rsi, 8(%r10) -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movq %rdx, 16(%r11) +; SSE2-NEXT: movq %rdi, (%r11) +; SSE2-NEXT: movq %rbp, 24(%r11) +; SSE2-NEXT: movq %rsi, 8(%r11) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq @@ -1672,44 +1616,44 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: pushq %rbp ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: testq %r9, %r9 +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: testq %rsi, %rsi +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: cmpb %al, %bl +; SSSE3-NEXT: sete %bpl +; SSSE3-NEXT: addq %r8, %rdi +; SSSE3-NEXT: adcq %r9, %rsi +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: cmpb %al, %bl +; SSSE3-NEXT: setne %al +; SSSE3-NEXT: andb %bpl, %al ; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx -; SSSE3-NEXT: movq %rcx, %rax -; SSSE3-NEXT: adcq %r11, %rax +; SSSE3-NEXT: movq %rcx, %rbp +; SSSE3-NEXT: adcq %r10, %rbp ; SSSE3-NEXT: setns %bl ; SSSE3-NEXT: testq %rcx, %rcx ; SSSE3-NEXT: setns %cl ; SSSE3-NEXT: cmpb %bl, %cl -; SSSE3-NEXT: setne %bpl -; SSSE3-NEXT: testq %r11, %r11 +; SSSE3-NEXT: setne %r8b +; SSSE3-NEXT: testq %r10, %r10 ; SSSE3-NEXT: setns %bl ; SSSE3-NEXT: cmpb %bl, %cl ; 
SSSE3-NEXT: sete %cl -; SSSE3-NEXT: andb %bpl, %cl -; SSSE3-NEXT: movzbl %cl, %ebp -; SSSE3-NEXT: testq %r9, %r9 -; SSSE3-NEXT: setns %bl -; SSSE3-NEXT: testq %rsi, %rsi -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: cmpb %bl, %cl -; SSSE3-NEXT: sete %r11b -; SSSE3-NEXT: addq %r8, %rdi -; SSSE3-NEXT: adcq %r9, %rsi -; SSSE3-NEXT: setns %bl -; SSSE3-NEXT: cmpb %bl, %cl -; SSSE3-NEXT: setne %cl -; SSSE3-NEXT: andb %r11b, %cl +; SSSE3-NEXT: andb %r8b, %cl ; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0 -; SSSE3-NEXT: movq %rdx, 16(%r10) -; SSSE3-NEXT: movq %rdi, (%r10) -; SSSE3-NEXT: movq %rax, 24(%r10) -; SSSE3-NEXT: movq %rsi, 8(%r10) -; SSSE3-NEXT: psllq $63, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: negl %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: negl %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movq %rdx, 16(%r11) +; SSSE3-NEXT: movq %rdi, (%r11) +; SSSE3-NEXT: movq %rbp, 24(%r11) +; SSSE3-NEXT: movq %rsi, 8(%r11) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %rbp ; SSSE3-NEXT: retq @@ -1718,44 +1662,43 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pushq %rbp ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: testq %r9, %r9 +; SSE41-NEXT: setns %al +; SSE41-NEXT: testq %rsi, %rsi +; SSE41-NEXT: setns %bl +; SSE41-NEXT: cmpb %al, %bl +; SSE41-NEXT: sete %bpl +; SSE41-NEXT: addq %r8, %rdi +; SSE41-NEXT: adcq %r9, %rsi +; SSE41-NEXT: setns %al +; SSE41-NEXT: cmpb %al, %bl +; SSE41-NEXT: setne %al +; SSE41-NEXT: andb %bpl, %al ; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: adcq %r11, %rax +; SSE41-NEXT: movq %rcx, %rbp +; SSE41-NEXT: adcq %r10, %rbp ; SSE41-NEXT: setns %bl ; SSE41-NEXT: testq %rcx, %rcx ; SSE41-NEXT: setns %cl ; SSE41-NEXT: cmpb %bl, %cl -; SSE41-NEXT: setne %bpl -; SSE41-NEXT: testq %r11, %r11 +; SSE41-NEXT: setne %r8b +; SSE41-NEXT: testq %r10, %r10 ; SSE41-NEXT: setns %bl ; SSE41-NEXT: cmpb %bl, %cl ; SSE41-NEXT: sete %cl -; SSE41-NEXT: andb %bpl, %cl -; SSE41-NEXT: movzbl %cl, %ebp -; SSE41-NEXT: testq %r9, %r9 -; SSE41-NEXT: setns %bl -; SSE41-NEXT: testq %rsi, %rsi -; SSE41-NEXT: setns %cl -; SSE41-NEXT: cmpb %bl, %cl -; SSE41-NEXT: sete %r11b -; SSE41-NEXT: addq %r8, %rdi -; SSE41-NEXT: adcq %r9, %rsi -; SSE41-NEXT: setns %bl -; SSE41-NEXT: cmpb %bl, %cl -; SSE41-NEXT: setne %cl -; SSE41-NEXT: andb %r11b, %cl +; SSE41-NEXT: andb %r8b, %cl ; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrb $8, %ebp, %xmm0 -; SSE41-NEXT: movq %rdx, 16(%r10) -; SSE41-NEXT: movq %rdi, (%r10) -; SSE41-NEXT: movq %rax, 24(%r10) -; SSE41-NEXT: movq %rsi, 8(%r10) -; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: negl %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; SSE41-NEXT: movq %rdx, 16(%r11) +; SSE41-NEXT: movq %rdi, (%r11) +; SSE41-NEXT: movq %rbp, 24(%r11) +; SSE41-NEXT: movq %rsi, 8(%r11) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %rbp ; SSE41-NEXT: retq @@ -1764,44 +1707,43 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: pushq %rbp ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), 
%r11 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: testq %r9, %r9 +; AVX1-NEXT: setns %al +; AVX1-NEXT: testq %rsi, %rsi +; AVX1-NEXT: setns %bl +; AVX1-NEXT: cmpb %al, %bl +; AVX1-NEXT: sete %bpl +; AVX1-NEXT: addq %r8, %rdi +; AVX1-NEXT: adcq %r9, %rsi +; AVX1-NEXT: setns %al +; AVX1-NEXT: cmpb %al, %bl +; AVX1-NEXT: setne %al +; AVX1-NEXT: andb %bpl, %al ; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: adcq %r11, %rax +; AVX1-NEXT: movq %rcx, %rbp +; AVX1-NEXT: adcq %r10, %rbp ; AVX1-NEXT: setns %bl ; AVX1-NEXT: testq %rcx, %rcx ; AVX1-NEXT: setns %cl ; AVX1-NEXT: cmpb %bl, %cl -; AVX1-NEXT: setne %bpl -; AVX1-NEXT: testq %r11, %r11 +; AVX1-NEXT: setne %r8b +; AVX1-NEXT: testq %r10, %r10 ; AVX1-NEXT: setns %bl ; AVX1-NEXT: cmpb %bl, %cl ; AVX1-NEXT: sete %cl -; AVX1-NEXT: andb %bpl, %cl -; AVX1-NEXT: movzbl %cl, %ebp -; AVX1-NEXT: testq %r9, %r9 -; AVX1-NEXT: setns %bl -; AVX1-NEXT: testq %rsi, %rsi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: cmpb %bl, %cl -; AVX1-NEXT: sete %r11b -; AVX1-NEXT: addq %r8, %rdi -; AVX1-NEXT: adcq %r9, %rsi -; AVX1-NEXT: setns %bl -; AVX1-NEXT: cmpb %bl, %cl -; AVX1-NEXT: setne %cl -; AVX1-NEXT: andb %r11b, %cl +; AVX1-NEXT: andb %r8b, %cl ; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 -; AVX1-NEXT: movq %rdx, 16(%r10) -; AVX1-NEXT: movq %rdi, (%r10) -; AVX1-NEXT: movq %rax, 24(%r10) -; AVX1-NEXT: movq %rsi, 8(%r10) -; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, 16(%r11) +; AVX1-NEXT: movq %rdi, (%r11) +; AVX1-NEXT: movq %rbp, 24(%r11) +; AVX1-NEXT: movq %rsi, 8(%r11) ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq @@ -1810,44 +1752,43 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: setns %al +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: setns %bl +; AVX2-NEXT: cmpb %al, %bl +; AVX2-NEXT: sete %bpl +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: adcq %r9, %rsi +; AVX2-NEXT: setns %al +; AVX2-NEXT: cmpb %al, %bl +; AVX2-NEXT: setne %al +; AVX2-NEXT: andb %bpl, %al ; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: adcq %r11, %rax +; AVX2-NEXT: movq %rcx, %rbp +; AVX2-NEXT: adcq %r10, %rbp ; AVX2-NEXT: setns %bl ; AVX2-NEXT: testq %rcx, %rcx ; AVX2-NEXT: setns %cl ; AVX2-NEXT: cmpb %bl, %cl -; AVX2-NEXT: setne %bpl -; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: setne %r8b +; AVX2-NEXT: testq %r10, %r10 ; AVX2-NEXT: setns %bl ; AVX2-NEXT: cmpb %bl, %cl ; AVX2-NEXT: sete %cl -; AVX2-NEXT: andb %bpl, %cl -; AVX2-NEXT: movzbl %cl, %ebp -; AVX2-NEXT: testq %r9, %r9 -; AVX2-NEXT: setns %bl -; AVX2-NEXT: testq %rsi, %rsi -; AVX2-NEXT: setns %cl -; AVX2-NEXT: cmpb %bl, %cl -; AVX2-NEXT: sete %r11b -; AVX2-NEXT: addq %r8, %rdi -; AVX2-NEXT: adcq %r9, %rsi -; AVX2-NEXT: setns %bl -; AVX2-NEXT: cmpb %bl, %cl -; AVX2-NEXT: setne %cl -; AVX2-NEXT: andb %r11b, %cl +; AVX2-NEXT: andb %r8b, %cl ; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: movq %rdx, 16(%r10) -; AVX2-NEXT: movq %rdi, (%r10) -; AVX2-NEXT: movq %rax, 24(%r10) 
-; AVX2-NEXT: movq %rsi, 8(%r10) -; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, 16(%r11) +; AVX2-NEXT: movq %rdi, (%r11) +; AVX2-NEXT: movq %rbp, 24(%r11) +; AVX2-NEXT: movq %rsi, 8(%r11) ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -1888,12 +1829,12 @@ ; AVX512-NEXT: andl $1, %ecx ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: korw %k0, %k1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: movq %rdx, 16(%r10) ; AVX512-NEXT: movq %rdi, (%r10) ; AVX512-NEXT: movq %r14, 24(%r10) ; AVX512-NEXT: movq %rsi, 8(%r10) -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r14 ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vec_smulo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_smulo.ll +++ llvm/test/CodeGen/X86/vec_smulo.ll @@ -51,238 +51,123 @@ define <2 x i32> @smulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { ; SSE2-LABEL: smulo_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %r8 -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movq %xmm2, %rdx -; SSE2-NEXT: movq %xmm1, %rsi -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: imulq %rdx, %rsi -; SSE2-NEXT: movq $-1, %r9 -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: cmovoq %r9, %rdx -; SSE2-NEXT: movq %rsi, %xmm1 -; SSE2-NEXT: imulq %r8, %rcx -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: cmovoq %r9, %rax -; SSE2-NEXT: movq %rax, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: movq %xmm1, (%rdi) +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: smulo_v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: psllq $32, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] -; SSSE3-NEXT: movq %xmm1, %r8 -; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm0, %rcx -; SSSE3-NEXT: movq %xmm2, %rdx -; SSSE3-NEXT: movq %xmm1, %rsi -; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: imulq %rdx, %rsi -; SSSE3-NEXT: movq $-1, %r9 -; SSSE3-NEXT: movl $0, %edx -; SSSE3-NEXT: cmovoq %r9, %rdx -; SSSE3-NEXT: movq %rsi, %xmm1 -; SSSE3-NEXT: imulq %r8, %rcx -; SSSE3-NEXT: movq %rcx, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: movq %rdx, %xmm0 -; SSSE3-NEXT: cmovoq %r9, %rax -; SSSE3-NEXT: movq %rax, %xmm3 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: paddd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSSE3-NEXT: psubd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: movq %xmm1, (%rdi) +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: smulo_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movq %xmm2, %r8 -; SSE41-NEXT: movdqa %xmm0, 
%xmm1 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: movq %xmm1, %rcx -; SSE41-NEXT: pextrq $1, %xmm2, %rdx -; SSE41-NEXT: pextrq $1, %xmm1, %rsi -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: imulq %rdx, %rsi -; SSE41-NEXT: movq $-1, %r9 -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: cmovoq %r9, %rdx -; SSE41-NEXT: movq %rsi, %xmm0 -; SSE41-NEXT: imulq %r8, %rcx -; SSE41-NEXT: movq %rcx, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psllq $32, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pmuldq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: movq %rdx, %xmm3 -; SSE41-NEXT: cmovoq %r9, %rax -; SSE41-NEXT: movq %rax, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE41-NEXT: movq %xmm1, (%rdi) +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: smulo_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vmovq %xmm1, %r8 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: vpextrq $1, %xmm1, %rdx -; AVX1-NEXT: vpextrq $1, %xmm0, %rsi -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: imulq %rdx, %rsi -; AVX1-NEXT: movq $-1, %r9 -; AVX1-NEXT: movl $0, %edx -; AVX1-NEXT: cmovoq %r9, %rdx -; AVX1-NEXT: vmovq %rsi, %xmm0 -; AVX1-NEXT: imulq %r8, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: cmovoq %r9, %rax -; AVX1-NEXT: vmovq %rax, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vmovq %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: smulo_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2 
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vmovq %xmm1, %r8 -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vpextrq $1, %xmm1, %rdx -; AVX2-NEXT: vpextrq $1, %xmm0, %rsi -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: imulq %rdx, %rsi -; AVX2-NEXT: movq $-1, %r9 -; AVX2-NEXT: movl $0, %edx -; AVX2-NEXT: cmovoq %r9, %rdx -; AVX2-NEXT: vmovq %rsi, %xmm0 -; AVX2-NEXT: imulq %r8, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: cmovoq %r9, %rax -; AVX2-NEXT: vmovq %rax, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vmovq %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: smulo_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: imulq %rdx, %rsi -; AVX512-NEXT: seto %dl -; AVX512-NEXT: vmovq %rsi, %xmm0 -; AVX512-NEXT: imulq %rax, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpneqq %xmm0, %xmm1, %k0 -; AVX512-NEXT: kmovd %edx, %k1 -; AVX512-NEXT: kshiftlw $1, %k1, %k1 -; AVX512-NEXT: seto %al -; AVX512-NEXT: andl $1, %eax -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: korw %k1, %k2, %k1 -; AVX512-NEXT: korw %k1, %k0, %k1 -; AVX512-NEXT: vpmovqd %xmm0, (%rdi) +; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 +; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i32>, <2 x i1>} @llvm.smul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 @@ -1735,14 +1620,15 @@ ; SSE2-NEXT: movq $-1, %r9 ; SSE2-NEXT: movl $0, %edx ; SSE2-NEXT: cmovoq %r9, %rdx -; SSE2-NEXT: 
movq %rdx, %xmm0 +; SSE2-NEXT: movq %rsi, %xmm1 ; SSE2-NEXT: imulq %r8, %rcx +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movq %rdx, %xmm0 ; SSE2-NEXT: cmovoq %r9, %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movq %rsi, %xmm1 -; SSE2-NEXT: movq %rcx, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movq %rax, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: movdqa %xmm1, (%rdi) ; SSE2-NEXT: retq ; @@ -1759,14 +1645,15 @@ ; SSSE3-NEXT: movq $-1, %r9 ; SSSE3-NEXT: movl $0, %edx ; SSSE3-NEXT: cmovoq %r9, %rdx -; SSSE3-NEXT: movq %rdx, %xmm0 +; SSSE3-NEXT: movq %rsi, %xmm1 ; SSSE3-NEXT: imulq %r8, %rcx +; SSSE3-NEXT: movq %rcx, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movq %rdx, %xmm0 ; SSSE3-NEXT: cmovoq %r9, %rax -; SSSE3-NEXT: movq %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movq %rsi, %xmm1 -; SSSE3-NEXT: movq %rcx, %xmm2 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movq %rax, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: movdqa %xmm1, (%rdi) ; SSSE3-NEXT: retq ; @@ -1781,15 +1668,16 @@ ; SSE41-NEXT: movq $-1, %r9 ; SSE41-NEXT: movl $0, %edx ; SSE41-NEXT: cmovoq %r9, %rdx -; SSE41-NEXT: movq %rdx, %xmm1 +; SSE41-NEXT: movq %rsi, %xmm0 ; SSE41-NEXT: imulq %r8, %rcx +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: movq %rdx, %xmm0 ; SSE41-NEXT: cmovoq %r9, %rax -; SSE41-NEXT: movq %rax, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: movq %rsi, %xmm1 -; SSE41-NEXT: movq %rcx, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE41-NEXT: movdqa %xmm2, (%rdi) +; SSE41-NEXT: movq %rax, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE41-NEXT: movdqa %xmm1, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: smulo_v2i64: @@ -1803,14 +1691,15 @@ ; AVX1-NEXT: movq $-1, %r9 ; AVX1-NEXT: movl $0, %edx ; AVX1-NEXT: cmovoq %r9, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm0 +; AVX1-NEXT: vmovq %rsi, %xmm0 ; AVX1-NEXT: imulq %r8, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: cmovoq %r9, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovq %rsi, %xmm1 -; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -1825,38 +1714,41 @@ ; AVX2-NEXT: movq $-1, %r9 ; AVX2-NEXT: movl $0, %edx ; AVX2-NEXT: cmovoq %r9, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 +; AVX2-NEXT: vmovq %rsi, %xmm0 ; AVX2-NEXT: imulq %r8, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq %rdx, %xmm0 ; AVX2-NEXT: cmovoq %r9, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vmovq %rsi, %xmm1 -; AVX2-NEXT: vmovq %rcx, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 
+; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: smulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vmovq %xmm1, %rdx +; AVX512-NEXT: vmovq %xmm0, %rsi ; AVX512-NEXT: imulq %rdx, %rsi ; AVX512-NEXT: seto %dl -; AVX512-NEXT: kmovd %edx, %k0 -; AVX512-NEXT: kshiftlw $1, %k0, %k0 ; AVX512-NEXT: imulq %rax, %rcx +; AVX512-NEXT: vmovq %rcx, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al -; AVX512-NEXT: andl $1, %eax -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: korw %k0, %k1, %k1 -; AVX512-NEXT: vmovq %rsi, %xmm0 -; AVX512-NEXT: vmovq %rcx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: kmovd %edx, %k1 +; AVX512-NEXT: kshiftrw $1, %k1, %k2 +; AVX512-NEXT: kxorw %k0, %k2, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kxorw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i64>, <2 x i1>} @llvm.smul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 @@ -2467,20 +2359,17 @@ ; SSE2-NEXT: movq %r12, %rcx ; SSE2-NEXT: callq __muloti4 ; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: setne %cl -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: setne %sil -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: sbbl %esi, %esi +; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: sbbl %ecx, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq %rdx, 24(%r15) ; SSE2-NEXT: movq %rax, 16(%r15) ; SSE2-NEXT: movq %rbp, 8(%r15) ; SSE2-NEXT: movq %r13, (%r15) -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: addq $24, %rsp ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 @@ -2519,20 +2408,17 @@ ; SSSE3-NEXT: movq %r12, %rcx ; SSSE3-NEXT: callq __muloti4 ; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: setne %cl -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: setne %sil -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSSE3-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT: sbbl %esi, %esi +; SSSE3-NEXT: movd %esi, %xmm1 +; SSSE3-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT: sbbl %ecx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movq %rdx, 24(%r15) ; SSSE3-NEXT: movq %rax, 16(%r15) ; SSSE3-NEXT: movq %rbp, 8(%r15) ; SSSE3-NEXT: movq %r13, (%r15) -; SSSE3-NEXT: psllq $63, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: addq $24, %rsp ; 
SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 @@ -2571,20 +2457,16 @@ ; SSE41-NEXT: movq %r12, %rcx ; SSE41-NEXT: callq __muloti4 ; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; SSE41-NEXT: setne %cl -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; SSE41-NEXT: setne %sil -; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: pinsrb $8, %ecx, %xmm0 +; SSE41-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: sbbl %esi, %esi +; SSE41-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: sbbl %ecx, %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %esi, %xmm0 ; SSE41-NEXT: movq %rdx, 24(%r15) ; SSE41-NEXT: movq %rax, 16(%r15) ; SSE41-NEXT: movq %rbp, 8(%r15) ; SSE41-NEXT: movq %r13, (%r15) -; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: addq $24, %rsp ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 @@ -2623,20 +2505,16 @@ ; AVX1-NEXT: movq %r12, %rcx ; AVX1-NEXT: callq __muloti4 ; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; AVX1-NEXT: setne %cl -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; AVX1-NEXT: setne %sil -; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; AVX1-NEXT: sbbl %esi, %esi +; AVX1-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; AVX1-NEXT: sbbl %ecx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; AVX1-NEXT: movq %rdx, 24(%r15) ; AVX1-NEXT: movq %rax, 16(%r15) ; AVX1-NEXT: movq %rbp, 8(%r15) ; AVX1-NEXT: movq %r13, (%r15) -; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %r12 @@ -2675,20 +2553,16 @@ ; AVX2-NEXT: movq %r12, %rcx ; AVX2-NEXT: callq __muloti4 ; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: setne %cl -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: setne %sil -; AVX2-NEXT: vmovd %esi, %xmm0 -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: sbbl %esi, %esi +; AVX2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: sbbl %ecx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; AVX2-NEXT: movq %rdx, 24(%r15) ; AVX2-NEXT: movq %rax, 16(%r15) ; AVX2-NEXT: movq %rbp, 8(%r15) ; AVX2-NEXT: movq %r13, (%r15) -; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: addq $24, %rsp ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 @@ -2708,23 +2582,23 @@ ; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: subq $24, %rsp ; AVX512-NEXT: movq %r8, %rax -; AVX512-NEXT: movq %rcx, %r14 +; AVX512-NEXT: movq %rcx, %r15 ; AVX512-NEXT: movq %rdx, %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8 ; AVX512-NEXT: movq %rax, %rdx ; AVX512-NEXT: movq %r9, %rcx ; AVX512-NEXT: callq __muloti4 -; AVX512-NEXT: movq %rax, %r13 +; AVX512-NEXT: movq %rax, %r14 ; AVX512-NEXT: movq %rdx, %rbp ; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8 ; AVX512-NEXT: movq %rbx, %rdi -; AVX512-NEXT: movq %r14, %rsi +; AVX512-NEXT: movq %r15, %rsi ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; 
AVX512-NEXT: movq %r12, %rcx +; AVX512-NEXT: movq %r13, %rcx ; AVX512-NEXT: callq __muloti4 ; AVX512-NEXT: cmpq $0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: setne %cl @@ -2735,12 +2609,12 @@ ; AVX512-NEXT: andl $1, %ecx ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: korw %k0, %k1, %k1 -; AVX512-NEXT: movq %rdx, 24(%r15) -; AVX512-NEXT: movq %rax, 16(%r15) -; AVX512-NEXT: movq %rbp, 8(%r15) -; AVX512-NEXT: movq %r13, (%r15) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: movq %rdx, 24(%r12) +; AVX512-NEXT: movq %rax, 16(%r12) +; AVX512-NEXT: movq %rbp, 8(%r12) +; AVX512-NEXT: movq %r14, (%r12) ; AVX512-NEXT: addq $24, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 Index: llvm/test/CodeGen/X86/vec_ssubo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_ssubo.ll +++ llvm/test/CodeGen/X86/vec_ssubo.ll @@ -49,134 +49,76 @@ } define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { -; SSE2-LABEL: ssubo_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psubq %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: movq %xmm1, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: ssubo_v2i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: psllq $32, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: psubq %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: movq %xmm1, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ssubo_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 
-; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: psubq %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE41-NEXT: movq %xmm1, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: ssubo_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ssubo_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vmovq %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: ssubo_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; 
AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vmovq %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: ssubo_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512-NEXT: vpmovqd %xmm0, (%rdi) -; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 @@ -1205,38 +1147,38 @@ ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] +; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: 
ssubo_v2i64: @@ -1253,38 +1195,38 @@ ; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] +; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, (%rdi) ; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ssubo_v2i64: @@ -1297,12 +1239,12 @@ ; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm2, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 @@ -1312,8 +1254,8 @@ ; SSE41-NEXT: pand %xmm6, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: pcmpeqq %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm0, (%rdi) ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm3 @@ -1324,11 +1266,11 @@ ; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm2 ; 
SSE41-NEXT: pcmpeqq %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: pandn %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: pandn %xmm2, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; SSE41-NEXT: retq ; ; AVX1-LABEL: ssubo_v2i64: @@ -1346,6 +1288,7 @@ ; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -1364,6 +1307,7 @@ ; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 ; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: retq ; @@ -1373,13 +1317,13 @@ ; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 ; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 ; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k2 ; AVX512-NEXT: kxorw %k2, %k1, %k1 ; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vmovdqa %xmm0, (%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 @@ -1665,44 +1609,44 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbp ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: sbbq %r11, %rax +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: testq %r9, %r9 +; SSE2-NEXT: setns %al +; SSE2-NEXT: testq %rsi, %rsi ; SSE2-NEXT: setns %bl -; SSE2-NEXT: testq %rcx, %rcx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: cmpb %bl, %cl +; SSE2-NEXT: cmpb %al, %bl ; SSE2-NEXT: setne %bpl -; SSE2-NEXT: testq %r11, %r11 -; SSE2-NEXT: setns %bl -; SSE2-NEXT: cmpb %bl, %cl -; SSE2-NEXT: setne %cl -; SSE2-NEXT: andb %bpl, %cl -; SSE2-NEXT: movzbl %cl, %ebp -; SSE2-NEXT: testq %r9, %r9 +; SSE2-NEXT: subq %r8, %rdi +; SSE2-NEXT: sbbq %r9, %rsi +; SSE2-NEXT: setns %al +; SSE2-NEXT: cmpb %al, %bl +; SSE2-NEXT: setne %al +; SSE2-NEXT: andb %bpl, %al +; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: movq %rcx, %rbp +; SSE2-NEXT: sbbq %r10, %rbp ; SSE2-NEXT: setns %bl -; SSE2-NEXT: testq %rsi, %rsi +; SSE2-NEXT: testq %rcx, %rcx ; SSE2-NEXT: setns %cl ; SSE2-NEXT: cmpb %bl, %cl -; SSE2-NEXT: setne %r11b -; SSE2-NEXT: subq %r8, %rdi -; SSE2-NEXT: sbbq %r9, %rsi +; SSE2-NEXT: setne %r8b +; SSE2-NEXT: testq %r10, %r10 ; SSE2-NEXT: setns %bl ; SSE2-NEXT: cmpb %bl, %cl ; SSE2-NEXT: setne %cl -; SSE2-NEXT: andb %r11b, %cl +; SSE2-NEXT: andb %r8b, %cl ; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: pinsrw $4, %ebp, %xmm0 -; SSE2-NEXT: movq %rdx, 16(%r10) -; SSE2-NEXT: movq %rdi, (%r10) -; SSE2-NEXT: movq %rax, 24(%r10) -; SSE2-NEXT: movq %rsi, 8(%r10) -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 
+; SSE2-NEXT: movq %rdx, 16(%r11) +; SSE2-NEXT: movq %rdi, (%r11) +; SSE2-NEXT: movq %rbp, 24(%r11) +; SSE2-NEXT: movq %rsi, 8(%r11) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq @@ -1711,44 +1655,44 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: pushq %rbp ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx -; SSSE3-NEXT: movq %rcx, %rax -; SSSE3-NEXT: sbbq %r11, %rax +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: testq %r9, %r9 +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: testq %rsi, %rsi ; SSSE3-NEXT: setns %bl -; SSSE3-NEXT: testq %rcx, %rcx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: cmpb %bl, %cl +; SSSE3-NEXT: cmpb %al, %bl ; SSSE3-NEXT: setne %bpl -; SSSE3-NEXT: testq %r11, %r11 -; SSSE3-NEXT: setns %bl -; SSSE3-NEXT: cmpb %bl, %cl -; SSSE3-NEXT: setne %cl -; SSSE3-NEXT: andb %bpl, %cl -; SSSE3-NEXT: movzbl %cl, %ebp -; SSSE3-NEXT: testq %r9, %r9 +; SSSE3-NEXT: subq %r8, %rdi +; SSSE3-NEXT: sbbq %r9, %rsi +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: cmpb %al, %bl +; SSSE3-NEXT: setne %al +; SSSE3-NEXT: andb %bpl, %al +; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSSE3-NEXT: movq %rcx, %rbp +; SSSE3-NEXT: sbbq %r10, %rbp ; SSSE3-NEXT: setns %bl -; SSSE3-NEXT: testq %rsi, %rsi +; SSSE3-NEXT: testq %rcx, %rcx ; SSSE3-NEXT: setns %cl ; SSSE3-NEXT: cmpb %bl, %cl -; SSSE3-NEXT: setne %r11b -; SSSE3-NEXT: subq %r8, %rdi -; SSSE3-NEXT: sbbq %r9, %rsi +; SSSE3-NEXT: setne %r8b +; SSSE3-NEXT: testq %r10, %r10 ; SSSE3-NEXT: setns %bl ; SSSE3-NEXT: cmpb %bl, %cl ; SSSE3-NEXT: setne %cl -; SSSE3-NEXT: andb %r11b, %cl +; SSSE3-NEXT: andb %r8b, %cl ; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0 -; SSSE3-NEXT: movq %rdx, 16(%r10) -; SSSE3-NEXT: movq %rdi, (%r10) -; SSSE3-NEXT: movq %rax, 24(%r10) -; SSSE3-NEXT: movq %rsi, 8(%r10) -; SSSE3-NEXT: psllq $63, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: negl %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: negl %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movq %rdx, 16(%r11) +; SSSE3-NEXT: movq %rdi, (%r11) +; SSSE3-NEXT: movq %rbp, 24(%r11) +; SSSE3-NEXT: movq %rsi, 8(%r11) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %rbp ; SSSE3-NEXT: retq @@ -1757,44 +1701,43 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pushq %rbp ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: sbbq %r11, %rax +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: testq %r9, %r9 +; SSE41-NEXT: setns %al +; SSE41-NEXT: testq %rsi, %rsi ; SSE41-NEXT: setns %bl -; SSE41-NEXT: testq %rcx, %rcx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: cmpb %al, %bl ; SSE41-NEXT: setne %bpl -; SSE41-NEXT: testq %r11, %r11 -; SSE41-NEXT: setns %bl -; SSE41-NEXT: cmpb %bl, %cl -; SSE41-NEXT: setne %cl -; SSE41-NEXT: andb %bpl, %cl -; SSE41-NEXT: movzbl %cl, %ebp -; SSE41-NEXT: testq %r9, %r9 +; SSE41-NEXT: subq %r8, %rdi +; SSE41-NEXT: sbbq %r9, %rsi +; SSE41-NEXT: setns %al +; SSE41-NEXT: cmpb %al, %bl +; SSE41-NEXT: setne %al +; SSE41-NEXT: andb %bpl, %al +; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSE41-NEXT: movq %rcx, %rbp +; SSE41-NEXT: sbbq %r10, %rbp ; SSE41-NEXT: setns %bl -; SSE41-NEXT: testq %rsi, 
%rsi +; SSE41-NEXT: testq %rcx, %rcx ; SSE41-NEXT: setns %cl ; SSE41-NEXT: cmpb %bl, %cl -; SSE41-NEXT: setne %r11b -; SSE41-NEXT: subq %r8, %rdi -; SSE41-NEXT: sbbq %r9, %rsi +; SSE41-NEXT: setne %r8b +; SSE41-NEXT: testq %r10, %r10 ; SSE41-NEXT: setns %bl ; SSE41-NEXT: cmpb %bl, %cl ; SSE41-NEXT: setne %cl -; SSE41-NEXT: andb %r11b, %cl +; SSE41-NEXT: andb %r8b, %cl ; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrb $8, %ebp, %xmm0 -; SSE41-NEXT: movq %rdx, 16(%r10) -; SSE41-NEXT: movq %rdi, (%r10) -; SSE41-NEXT: movq %rax, 24(%r10) -; SSE41-NEXT: movq %rsi, 8(%r10) -; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: negl %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; SSE41-NEXT: movq %rdx, 16(%r11) +; SSE41-NEXT: movq %rdi, (%r11) +; SSE41-NEXT: movq %rbp, 24(%r11) +; SSE41-NEXT: movq %rsi, 8(%r11) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %rbp ; SSE41-NEXT: retq @@ -1803,44 +1746,43 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: pushq %rbp ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: sbbq %r11, %rax +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: testq %r9, %r9 +; AVX1-NEXT: setns %al +; AVX1-NEXT: testq %rsi, %rsi ; AVX1-NEXT: setns %bl -; AVX1-NEXT: testq %rcx, %rcx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: cmpb %al, %bl ; AVX1-NEXT: setne %bpl -; AVX1-NEXT: testq %r11, %r11 -; AVX1-NEXT: setns %bl -; AVX1-NEXT: cmpb %bl, %cl -; AVX1-NEXT: setne %cl -; AVX1-NEXT: andb %bpl, %cl -; AVX1-NEXT: movzbl %cl, %ebp -; AVX1-NEXT: testq %r9, %r9 +; AVX1-NEXT: subq %r8, %rdi +; AVX1-NEXT: sbbq %r9, %rsi +; AVX1-NEXT: setns %al +; AVX1-NEXT: cmpb %al, %bl +; AVX1-NEXT: setne %al +; AVX1-NEXT: andb %bpl, %al +; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; AVX1-NEXT: movq %rcx, %rbp +; AVX1-NEXT: sbbq %r10, %rbp ; AVX1-NEXT: setns %bl -; AVX1-NEXT: testq %rsi, %rsi +; AVX1-NEXT: testq %rcx, %rcx ; AVX1-NEXT: setns %cl ; AVX1-NEXT: cmpb %bl, %cl -; AVX1-NEXT: setne %r11b -; AVX1-NEXT: subq %r8, %rdi -; AVX1-NEXT: sbbq %r9, %rsi +; AVX1-NEXT: setne %r8b +; AVX1-NEXT: testq %r10, %r10 ; AVX1-NEXT: setns %bl ; AVX1-NEXT: cmpb %bl, %cl ; AVX1-NEXT: setne %cl -; AVX1-NEXT: andb %r11b, %cl +; AVX1-NEXT: andb %r8b, %cl ; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 -; AVX1-NEXT: movq %rdx, 16(%r10) -; AVX1-NEXT: movq %rdi, (%r10) -; AVX1-NEXT: movq %rax, 24(%r10) -; AVX1-NEXT: movq %rsi, 8(%r10) -; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: negl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, 16(%r11) +; AVX1-NEXT: movq %rdi, (%r11) +; AVX1-NEXT: movq %rbp, 24(%r11) +; AVX1-NEXT: movq %rsi, 8(%r11) ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq @@ -1849,44 +1791,43 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: sbbq %r11, %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: 
testq %r9, %r9 +; AVX2-NEXT: setns %al +; AVX2-NEXT: testq %rsi, %rsi ; AVX2-NEXT: setns %bl -; AVX2-NEXT: testq %rcx, %rcx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: cmpb %al, %bl ; AVX2-NEXT: setne %bpl -; AVX2-NEXT: testq %r11, %r11 -; AVX2-NEXT: setns %bl -; AVX2-NEXT: cmpb %bl, %cl -; AVX2-NEXT: setne %cl -; AVX2-NEXT: andb %bpl, %cl -; AVX2-NEXT: movzbl %cl, %ebp -; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: subq %r8, %rdi +; AVX2-NEXT: sbbq %r9, %rsi +; AVX2-NEXT: setns %al +; AVX2-NEXT: cmpb %al, %bl +; AVX2-NEXT: setne %al +; AVX2-NEXT: andb %bpl, %al +; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq %rcx, %rbp +; AVX2-NEXT: sbbq %r10, %rbp ; AVX2-NEXT: setns %bl -; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: testq %rcx, %rcx ; AVX2-NEXT: setns %cl ; AVX2-NEXT: cmpb %bl, %cl -; AVX2-NEXT: setne %r11b -; AVX2-NEXT: subq %r8, %rdi -; AVX2-NEXT: sbbq %r9, %rsi +; AVX2-NEXT: setne %r8b +; AVX2-NEXT: testq %r10, %r10 ; AVX2-NEXT: setns %bl ; AVX2-NEXT: cmpb %bl, %cl ; AVX2-NEXT: setne %cl -; AVX2-NEXT: andb %r11b, %cl +; AVX2-NEXT: andb %r8b, %cl ; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: movq %rdx, 16(%r10) -; AVX2-NEXT: movq %rdi, (%r10) -; AVX2-NEXT: movq %rax, 24(%r10) -; AVX2-NEXT: movq %rsi, 8(%r10) -; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, 16(%r11) +; AVX2-NEXT: movq %rdi, (%r11) +; AVX2-NEXT: movq %rbp, 24(%r11) +; AVX2-NEXT: movq %rsi, 8(%r11) ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -1927,12 +1868,12 @@ ; AVX512-NEXT: andl $1, %ecx ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: korw %k0, %k1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: movq %rdx, 16(%r10) ; AVX512-NEXT: movq %rdi, (%r10) ; AVX512-NEXT: movq %r14, 24(%r10) ; AVX512-NEXT: movq %rsi, 8(%r10) -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r14 ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vec_uaddo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_uaddo.ll +++ llvm/test/CodeGen/X86/vec_uaddo.ll @@ -47,91 +47,61 @@ define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { ; SSE2-LABEL: uaddo_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: movq %xmm1, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: uaddo_v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; SSSE3-NEXT: 
pand %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: paddq %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: paddd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: movq %xmm1, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: uaddo_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movq %xmm1, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: uaddo_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vmovq %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: uaddo_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vmovq %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: uaddo_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpmovqd %xmm0, (%rdi) -; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: 
vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 @@ -924,10 +894,11 @@ ; SSE-NEXT: pcmpgtd %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; @@ -938,6 +909,7 @@ ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -948,6 +920,7 @@ ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: retq ; @@ -955,9 +928,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpltuq %xmm0, %xmm1, %k1 -; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 @@ -1225,21 +1198,17 @@ ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %r11d +; SSE2-NEXT: sbbl %eax, %eax ; SSE2-NEXT: addq %r8, %rdi ; SSE2-NEXT: adcq %r9, %rsi -; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: sbbl %eax, %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pinsrw $4, %r11d, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq %rdx, 16(%r10) ; SSE2-NEXT: movq %rdi, (%r10) ; SSE2-NEXT: movq %rcx, 24(%r10) ; SSE2-NEXT: movq %rsi, 8(%r10) -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: uaddo_v2i128: @@ -1247,21 +1216,17 @@ ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %r11d +; SSSE3-NEXT: sbbl %eax, %eax ; SSSE3-NEXT: addq %r8, %rdi ; SSSE3-NEXT: adcq %r9, %rsi -; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: sbbl %eax, %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pinsrw $4, %r11d, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movq %rdx, 16(%r10) ; SSSE3-NEXT: movq %rdi, (%r10) ; SSSE3-NEXT: movq %rcx, 24(%r10) ; SSSE3-NEXT: movq %rsi, 8(%r10) -; SSSE3-NEXT: psllq $63, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: uaddo_v2i128: @@ -1269,21 +1234,16 @@ ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; 
SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %r11d +; SSE41-NEXT: sbbl %r11d, %r11d ; SSE41-NEXT: addq %r8, %rdi ; SSE41-NEXT: adcq %r9, %rsi -; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: sbbl %eax, %eax ; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pinsrb $8, %r11d, %xmm0 +; SSE41-NEXT: pinsrd $1, %r11d, %xmm0 ; SSE41-NEXT: movq %rdx, 16(%r10) ; SSE41-NEXT: movq %rdi, (%r10) ; SSE41-NEXT: movq %rcx, 24(%r10) ; SSE41-NEXT: movq %rsi, 8(%r10) -; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: retq ; ; AVX1-LABEL: uaddo_v2i128: @@ -1291,21 +1251,16 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; AVX1-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; AVX1-NEXT: setb %al -; AVX1-NEXT: movzbl %al, %r11d +; AVX1-NEXT: sbbl %r11d, %r11d ; AVX1-NEXT: addq %r8, %rdi ; AVX1-NEXT: adcq %r9, %rsi -; AVX1-NEXT: setb %al -; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: sbbl %eax, %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0 ; AVX1-NEXT: movq %rdx, 16(%r10) ; AVX1-NEXT: movq %rdi, (%r10) ; AVX1-NEXT: movq %rcx, 24(%r10) ; AVX1-NEXT: movq %rsi, 8(%r10) -; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: uaddo_v2i128: @@ -1313,21 +1268,16 @@ ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: setb %al -; AVX2-NEXT: movzbl %al, %r11d +; AVX2-NEXT: sbbl %r11d, %r11d ; AVX2-NEXT: addq %r8, %rdi ; AVX2-NEXT: adcq %r9, %rsi -; AVX2-NEXT: setb %al -; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: sbbl %eax, %eax ; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0 ; AVX2-NEXT: movq %rdx, 16(%r10) ; AVX2-NEXT: movq %rdi, (%r10) ; AVX2-NEXT: movq %rcx, 24(%r10) ; AVX2-NEXT: movq %rsi, 8(%r10) -; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: uaddo_v2i128: @@ -1344,12 +1294,12 @@ ; AVX512-NEXT: andl $1, %eax ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: korw %k0, %k1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: movq %rdx, 16(%r10) ; AVX512-NEXT: movq %rdi, (%r10) ; AVX512-NEXT: movq %rcx, 24(%r10) ; AVX512-NEXT: movq %rsi, 8(%r10) -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 Index: llvm/test/CodeGen/X86/vec_umulo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_umulo.ll +++ llvm/test/CodeGen/X86/vec_umulo.ll @@ -57,205 +57,105 @@ define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { ; SSE2-LABEL: umulo_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm3, %r8 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm2, 
%r10 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm1, %rdx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: mulq %rdx -; SSE2-NEXT: movq $-1, %r9 -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: cmovoq %r9, %rcx -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: movq %r8, %rax -; SSE2-NEXT: mulq %r10 -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: cmovoq %r9, %rsi -; SSE2-NEXT: movq %rsi, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movq %xmm1, (%rdi) +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSSE3-NEXT: movq %xmm3, %r8 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm2, %r10 -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: movq %xmm1, %rdx -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: mulq %rdx -; SSSE3-NEXT: movq $-1, %r9 -; SSSE3-NEXT: movl $0, %ecx -; SSSE3-NEXT: cmovoq %r9, %rcx -; SSSE3-NEXT: movq %rax, %xmm0 -; SSSE3-NEXT: movq %r8, %rax -; SSSE3-NEXT: mulq %r10 -; SSSE3-NEXT: movq %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSSE3-NEXT: psrlq $32, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: movq %rcx, %xmm0 -; SSSE3-NEXT: cmovoq %r9, %rsi -; SSSE3-NEXT: movq %rsi, %xmm3 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: movq %xmm1, (%rdi) +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: 
movq %xmm0, %r8 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movq %xmm1, %rcx -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: pextrq $1, %xmm1, %rdx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: mulq %rdx -; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: movq $-1, %r10 -; SSE41-NEXT: movl $0, %eax -; SSE41-NEXT: cmovoq %r10, %rax -; SSE41-NEXT: movq %rax, %xmm0 -; SSE41-NEXT: movq %r8, %rax -; SSE41-NEXT: mulq %rcx -; SSE41-NEXT: cmovoq %r10, %rsi -; SSE41-NEXT: movq %rsi, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: movq %r9, %xmm0 -; SSE41-NEXT: movq %rax, %xmm3 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE41-NEXT: psrlq $32, %xmm3 -; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movq %xmm4, (%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pmuludq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm2 +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: umulo_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vmovq %xmm0, %r8 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vpextrq $1, %xmm1, %rdx -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: mulq %rdx -; AVX1-NEXT: movq %rax, %r9 -; AVX1-NEXT: movq $-1, %r10 -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: cmovoq %r10, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: movq %r8, %rax -; AVX1-NEXT: mulq %rcx -; AVX1-NEXT: cmovoq %r10, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovq %r9, %xmm1 -; AVX1-NEXT: vmovq %rax, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vmovdqa %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vmovq %xmm0, %r8 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vmovq 
%xmm1, %rcx -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vpextrq $1, %xmm1, %rdx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: mulq %rdx -; AVX2-NEXT: movq %rax, %r9 -; AVX2-NEXT: movq $-1, %r10 -; AVX2-NEXT: movl $0, %eax -; AVX2-NEXT: cmovoq %r10, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: movq %r8, %rax -; AVX2-NEXT: mulq %rcx -; AVX2-NEXT: cmovoq %r10, %rsi -; AVX2-NEXT: vmovq %rsi, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vmovq %r9, %xmm1 -; AVX2-NEXT: vmovq %rax, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: umulo_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: mulq %rdx -; AVX512-NEXT: seto %r8b -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: mulq %rsi -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 -; AVX512-NEXT: vptestmq %xmm1, %xmm1, %k0 -; AVX512-NEXT: kmovd %r8d, %k1 -; AVX512-NEXT: kshiftlw $1, %k1, %k1 -; AVX512-NEXT: seto %al -; AVX512-NEXT: andl $1, %eax -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: korw %k1, %k2, %k1 -; AVX512-NEXT: korw %k1, %k0, %k1 -; AVX512-NEXT: vpmovqd %xmm0, (%rdi) +; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 +; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 @@ -1509,150 +1409,152 @@ ; SSE2-LABEL: umulo_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %r9 +; SSE2-NEXT: movq %xmm2, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rsi +; SSE2-NEXT: movq %xmm2, %r10 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: movq %xmm1, %rdx ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: mulq %rdx -; SSE2-NEXT: movq %rax, %r8 -; SSE2-NEXT: movq $-1, %r10 
-; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: cmovoq %r10, %rax +; SSE2-NEXT: movq $-1, %r9 +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: cmovoq %r9, %rsi +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movq %r8, %rax +; SSE2-NEXT: mulq %r10 ; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: movq %r9, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: cmovoq %r10, %rcx -; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movq %r8, %xmm1 -; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movq %rsi, %xmm0 +; SSE2-NEXT: cmovoq %r9, %rcx +; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: movdqa %xmm1, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v2i64: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSSE3-NEXT: movq %xmm2, %r9 +; SSSE3-NEXT: movq %xmm2, %r8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm2, %rsi +; SSSE3-NEXT: movq %xmm2, %r10 ; SSSE3-NEXT: movq %xmm0, %rax ; SSSE3-NEXT: movq %xmm1, %rdx ; SSSE3-NEXT: xorl %ecx, %ecx ; SSSE3-NEXT: mulq %rdx -; SSSE3-NEXT: movq %rax, %r8 -; SSSE3-NEXT: movq $-1, %r10 -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: cmovoq %r10, %rax +; SSSE3-NEXT: movq $-1, %r9 +; SSSE3-NEXT: movl $0, %esi +; SSSE3-NEXT: cmovoq %r9, %rsi +; SSSE3-NEXT: movq %rax, %xmm1 +; SSSE3-NEXT: movq %r8, %rax +; SSSE3-NEXT: mulq %r10 ; SSSE3-NEXT: movq %rax, %xmm0 -; SSSE3-NEXT: movq %r9, %rax -; SSSE3-NEXT: mulq %rsi -; SSSE3-NEXT: cmovoq %r10, %rcx -; SSSE3-NEXT: movq %rcx, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movq %r8, %xmm1 -; SSSE3-NEXT: movq %rax, %xmm2 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movq %rsi, %xmm0 +; SSSE3-NEXT: cmovoq %r9, %rcx +; SSSE3-NEXT: movq %rcx, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: movdqa %xmm1, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %xmm1, %r9 +; SSE41-NEXT: movq %xmm0, %r10 +; SSE41-NEXT: movq %xmm1, %r8 ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: pextrq $1, %xmm1, %rdx ; SSE41-NEXT: xorl %esi, %esi ; SSE41-NEXT: mulq %rdx -; SSE41-NEXT: movq %rax, %r8 -; SSE41-NEXT: movq $-1, %r10 -; SSE41-NEXT: movl $0, %eax -; SSE41-NEXT: cmovoq %r10, %rax +; SSE41-NEXT: movq $-1, %r9 +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: cmovoq %r9, %rcx +; SSE41-NEXT: movq %rax, %xmm0 +; SSE41-NEXT: movq %r10, %rax +; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %xmm1 -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %r9 -; SSE41-NEXT: cmovoq %r10, %rsi -; SSE41-NEXT: movq %rsi, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: movq %r8, %xmm1 -; SSE41-NEXT: movq %rax, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE41-NEXT: movdqa %xmm2, (%rdi) +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: cmovoq %r9, %rsi +; SSE41-NEXT: movq %rsi, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE41-NEXT: movdqa %xmm1, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: umulo_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq 
%xmm0, %rcx -; AVX1-NEXT: vmovq %xmm1, %r9 +; AVX1-NEXT: vmovq %xmm0, %r10 +; AVX1-NEXT: vmovq %xmm1, %r8 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: vpextrq $1, %xmm1, %rdx ; AVX1-NEXT: xorl %esi, %esi ; AVX1-NEXT: mulq %rdx -; AVX1-NEXT: movq %rax, %r8 -; AVX1-NEXT: movq $-1, %r10 -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: cmovoq %r10, %rax +; AVX1-NEXT: movq $-1, %r9 +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: cmovoq %r9, %rcx ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %r9 -; AVX1-NEXT: cmovoq %r10, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovq %r8, %xmm1 -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: movq %r10, %rax +; AVX1-NEXT: mulq %r8 +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: cmovoq %r9, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vmovq %xmm1, %r9 +; AVX2-NEXT: vmovq %xmm0, %r10 +; AVX2-NEXT: vmovq %xmm1, %r8 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-NEXT: vpextrq $1, %xmm1, %rdx ; AVX2-NEXT: xorl %esi, %esi ; AVX2-NEXT: mulq %rdx -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq $-1, %r10 -; AVX2-NEXT: movl $0, %eax -; AVX2-NEXT: cmovoq %r10, %rax +; AVX2-NEXT: movq $-1, %r9 +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: cmovoq %r9, %rcx ; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %r9 -; AVX2-NEXT: cmovoq %r10, %rsi -; AVX2-NEXT: vmovq %rsi, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vmovq %r8, %xmm1 -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: movq %r10, %rax +; AVX2-NEXT: mulq %r8 +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: cmovoq %r9, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: umulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovq %xmm1, %rdx ; AVX512-NEXT: mulq %rdx -; AVX512-NEXT: movq %rax, %r8 +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: seto %r9b +; AVX512-NEXT: movq %rcx, %rax +; AVX512-NEXT: mulq %r8 +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftlw $1, %k0, %k0 -; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: mulq %rsi -; AVX512-NEXT: seto %cl -; AVX512-NEXT: andl $1, %ecx -; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: korw %k0, %k1, %k1 -; AVX512-NEXT: vmovq %r8, %xmm0 -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: kmovd %r9d, %k1 +; AVX512-NEXT: kshiftrw $1, %k1, %k2 +; 
AVX512-NEXT: kxorw %k0, %k2, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kxorw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 @@ -2167,66 +2069,67 @@ ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: movq %rdx, %r12 -; SSE2-NEXT: movq %rdi, %r11 +; SSE2-NEXT: movq %r9, %r10 +; SSE2-NEXT: movq %rcx, %r12 +; SSE2-NEXT: movq %rdx, %r11 +; SSE2-NEXT: movq %rsi, %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSE2-NEXT: testq %r10, %r10 -; SSE2-NEXT: setne %dl -; SSE2-NEXT: testq %rcx, %rcx -; SSE2-NEXT: setne %r13b -; SSE2-NEXT: andb %dl, %r13b -; SSE2-NEXT: mulq %r15 -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: seto %bpl -; SSE2-NEXT: movq %r10, %rax -; SSE2-NEXT: mulq %r12 -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: seto %cl -; SSE2-NEXT: orb %bpl, %cl -; SSE2-NEXT: addq %rdi, %rbx -; SSE2-NEXT: movq %r12, %rax -; SSE2-NEXT: mulq %r15 -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: movq %rdx, %r15 -; SSE2-NEXT: addq %rbx, %r15 -; SSE2-NEXT: setb %al -; SSE2-NEXT: orb %cl, %al -; SSE2-NEXT: orb %r13b, %al -; SSE2-NEXT: movzbl %al, %ebp -; SSE2-NEXT: testq %r9, %r9 -; SSE2-NEXT: setne %al +; SSE2-NEXT: setne %cl ; SSE2-NEXT: testq %rsi, %rsi ; SSE2-NEXT: setne %r13b -; SSE2-NEXT: andb %al, %r13b -; SSE2-NEXT: movq %rsi, %rax +; SSE2-NEXT: andb %cl, %r13b ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: seto %r12b -; SSE2-NEXT: movq %r9, %rax -; SSE2-NEXT: mulq %r11 -; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: seto %bpl +; SSE2-NEXT: movq %r10, %rax +; SSE2-NEXT: mulq %rdi +; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: seto %bl -; SSE2-NEXT: orb %r12b, %bl -; SSE2-NEXT: addq %rsi, %rdi -; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: orb %bpl, %bl +; SSE2-NEXT: addq %rsi, %rcx +; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r8 -; SSE2-NEXT: addq %rdi, %rdx +; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: addq %rcx, %rsi ; SSE2-NEXT: setb %cl ; SSE2-NEXT: orb %bl, %cl ; SSE2-NEXT: orb %r13b, %cl +; SSE2-NEXT: testq %r9, %r9 +; SSE2-NEXT: setne %al +; SSE2-NEXT: testq %r12, %r12 +; SSE2-NEXT: setne %r8b +; SSE2-NEXT: andb %al, %r8b +; SSE2-NEXT: movq %r12, %rax +; SSE2-NEXT: mulq %r15 +; SSE2-NEXT: movq %rax, %rbp +; SSE2-NEXT: seto %r10b +; SSE2-NEXT: movq %r9, %rax +; SSE2-NEXT: mulq %r11 +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: seto %r9b +; SSE2-NEXT: orb %r10b, %r9b +; SSE2-NEXT: addq %rbp, %rbx +; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: mulq %r15 +; SSE2-NEXT: addq %rbx, %rdx +; SSE2-NEXT: setb %bl +; SSE2-NEXT: orb %r9b, %bl +; SSE2-NEXT: orb %r8b, %bl +; SSE2-NEXT: movzbl %bl, %ebp +; SSE2-NEXT: negl %ebp +; SSE2-NEXT: movd %ebp, %xmm1 ; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: pinsrw $4, %ebp, %xmm0 -; SSE2-NEXT: movq %r10, 16(%r14) -; SSE2-NEXT: movq %rax, (%r14) -; SSE2-NEXT: movq %r15, 24(%r14) -; SSE2-NEXT: movq %rdx, 8(%r14) -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: 
punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movq %rax, 16(%r14) +; SSE2-NEXT: movq %rdi, (%r14) +; SSE2-NEXT: movq %rdx, 24(%r14) +; SSE2-NEXT: movq %rsi, 8(%r14) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -2243,66 +2146,67 @@ ; SSSE3-NEXT: pushq %r13 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq %rcx, %rax -; SSSE3-NEXT: movq %rdx, %r12 -; SSSE3-NEXT: movq %rdi, %r11 +; SSSE3-NEXT: movq %r9, %r10 +; SSSE3-NEXT: movq %rcx, %r12 +; SSSE3-NEXT: movq %rdx, %r11 +; SSSE3-NEXT: movq %rsi, %rax ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSSE3-NEXT: testq %r10, %r10 -; SSSE3-NEXT: setne %dl -; SSSE3-NEXT: testq %rcx, %rcx -; SSSE3-NEXT: setne %r13b -; SSSE3-NEXT: andb %dl, %r13b -; SSSE3-NEXT: mulq %r15 -; SSSE3-NEXT: movq %rax, %rdi -; SSSE3-NEXT: seto %bpl -; SSSE3-NEXT: movq %r10, %rax -; SSSE3-NEXT: mulq %r12 -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: seto %cl -; SSSE3-NEXT: orb %bpl, %cl -; SSSE3-NEXT: addq %rdi, %rbx -; SSSE3-NEXT: movq %r12, %rax -; SSSE3-NEXT: mulq %r15 -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: movq %rdx, %r15 -; SSSE3-NEXT: addq %rbx, %r15 -; SSSE3-NEXT: setb %al -; SSSE3-NEXT: orb %cl, %al -; SSSE3-NEXT: orb %r13b, %al -; SSSE3-NEXT: movzbl %al, %ebp -; SSSE3-NEXT: testq %r9, %r9 -; SSSE3-NEXT: setne %al +; SSSE3-NEXT: setne %cl ; SSSE3-NEXT: testq %rsi, %rsi ; SSSE3-NEXT: setne %r13b -; SSSE3-NEXT: andb %al, %r13b -; SSSE3-NEXT: movq %rsi, %rax +; SSSE3-NEXT: andb %cl, %r13b ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: seto %r12b -; SSSE3-NEXT: movq %r9, %rax -; SSSE3-NEXT: mulq %r11 -; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: seto %bpl +; SSSE3-NEXT: movq %r10, %rax +; SSSE3-NEXT: mulq %rdi +; SSSE3-NEXT: movq %rax, %rcx ; SSSE3-NEXT: seto %bl -; SSSE3-NEXT: orb %r12b, %bl -; SSSE3-NEXT: addq %rsi, %rdi -; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: orb %bpl, %bl +; SSSE3-NEXT: addq %rsi, %rcx +; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r8 -; SSSE3-NEXT: addq %rdi, %rdx +; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: movq %rdx, %rsi +; SSSE3-NEXT: addq %rcx, %rsi ; SSSE3-NEXT: setb %cl ; SSSE3-NEXT: orb %bl, %cl ; SSSE3-NEXT: orb %r13b, %cl +; SSSE3-NEXT: testq %r9, %r9 +; SSSE3-NEXT: setne %al +; SSSE3-NEXT: testq %r12, %r12 +; SSSE3-NEXT: setne %r8b +; SSSE3-NEXT: andb %al, %r8b +; SSSE3-NEXT: movq %r12, %rax +; SSSE3-NEXT: mulq %r15 +; SSSE3-NEXT: movq %rax, %rbp +; SSSE3-NEXT: seto %r10b +; SSSE3-NEXT: movq %r9, %rax +; SSSE3-NEXT: mulq %r11 +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: seto %r9b +; SSSE3-NEXT: orb %r10b, %r9b +; SSSE3-NEXT: addq %rbp, %rbx +; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: mulq %r15 +; SSSE3-NEXT: addq %rbx, %rdx +; SSSE3-NEXT: setb %bl +; SSSE3-NEXT: orb %r9b, %bl +; SSSE3-NEXT: orb %r8b, %bl +; SSSE3-NEXT: movzbl %bl, %ebp +; SSSE3-NEXT: negl %ebp +; SSSE3-NEXT: movd %ebp, %xmm1 ; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0 -; SSSE3-NEXT: movq %r10, 16(%r14) -; SSSE3-NEXT: movq %rax, (%r14) -; SSSE3-NEXT: movq %r15, 24(%r14) -; SSSE3-NEXT: movq %rdx, 8(%r14) -; SSSE3-NEXT: psllq $63, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movq %rax, 16(%r14) +; SSSE3-NEXT: movq %rdi, (%r14) +; 
SSSE3-NEXT: movq %rdx, 24(%r14) +; SSSE3-NEXT: movq %rsi, 8(%r14) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -2319,66 +2223,66 @@ ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: movq %rdx, %r12 -; SSE41-NEXT: movq %rdi, %r11 +; SSE41-NEXT: movq %r9, %r10 +; SSE41-NEXT: movq %rcx, %r12 +; SSE41-NEXT: movq %rdx, %r11 +; SSE41-NEXT: movq %rsi, %rax ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSE41-NEXT: testq %r10, %r10 -; SSE41-NEXT: setne %dl -; SSE41-NEXT: testq %rcx, %rcx -; SSE41-NEXT: setne %r13b -; SSE41-NEXT: andb %dl, %r13b -; SSE41-NEXT: mulq %r15 -; SSE41-NEXT: movq %rax, %rdi -; SSE41-NEXT: seto %bpl -; SSE41-NEXT: movq %r10, %rax -; SSE41-NEXT: mulq %r12 -; SSE41-NEXT: movq %rax, %rbx -; SSE41-NEXT: seto %cl -; SSE41-NEXT: orb %bpl, %cl -; SSE41-NEXT: addq %rdi, %rbx -; SSE41-NEXT: movq %r12, %rax -; SSE41-NEXT: mulq %r15 -; SSE41-NEXT: movq %rax, %r10 -; SSE41-NEXT: movq %rdx, %r15 -; SSE41-NEXT: addq %rbx, %r15 -; SSE41-NEXT: setb %al -; SSE41-NEXT: orb %cl, %al -; SSE41-NEXT: orb %r13b, %al -; SSE41-NEXT: movzbl %al, %ebp -; SSE41-NEXT: testq %r9, %r9 -; SSE41-NEXT: setne %al +; SSE41-NEXT: setne %cl ; SSE41-NEXT: testq %rsi, %rsi ; SSE41-NEXT: setne %r13b -; SSE41-NEXT: andb %al, %r13b -; SSE41-NEXT: movq %rsi, %rax +; SSE41-NEXT: andb %cl, %r13b ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rsi -; SSE41-NEXT: seto %r12b -; SSE41-NEXT: movq %r9, %rax -; SSE41-NEXT: mulq %r11 -; SSE41-NEXT: movq %rax, %rdi +; SSE41-NEXT: seto %bpl +; SSE41-NEXT: movq %r10, %rax +; SSE41-NEXT: mulq %rdi +; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: seto %bl -; SSE41-NEXT: orb %r12b, %bl -; SSE41-NEXT: addq %rsi, %rdi -; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: orb %bpl, %bl +; SSE41-NEXT: addq %rsi, %rcx +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r8 -; SSE41-NEXT: addq %rdi, %rdx +; SSE41-NEXT: movq %rax, %rdi +; SSE41-NEXT: movq %rdx, %rsi +; SSE41-NEXT: addq %rcx, %rsi ; SSE41-NEXT: setb %cl ; SSE41-NEXT: orb %bl, %cl ; SSE41-NEXT: orb %r13b, %cl +; SSE41-NEXT: testq %r9, %r9 +; SSE41-NEXT: setne %al +; SSE41-NEXT: testq %r12, %r12 +; SSE41-NEXT: setne %r8b +; SSE41-NEXT: andb %al, %r8b +; SSE41-NEXT: movq %r12, %rax +; SSE41-NEXT: mulq %r15 +; SSE41-NEXT: movq %rax, %rbp +; SSE41-NEXT: seto %r10b +; SSE41-NEXT: movq %r9, %rax +; SSE41-NEXT: mulq %r11 +; SSE41-NEXT: movq %rax, %rbx +; SSE41-NEXT: seto %r9b +; SSE41-NEXT: orb %r10b, %r9b +; SSE41-NEXT: addq %rbp, %rbx +; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: mulq %r15 +; SSE41-NEXT: addq %rbx, %rdx +; SSE41-NEXT: setb %bl +; SSE41-NEXT: orb %r9b, %bl +; SSE41-NEXT: orb %r8b, %bl +; SSE41-NEXT: movzbl %bl, %ebp +; SSE41-NEXT: negl %ebp ; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: negl %ecx ; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrb $8, %ebp, %xmm0 -; SSE41-NEXT: movq %r10, 16(%r14) -; SSE41-NEXT: movq %rax, (%r14) -; SSE41-NEXT: movq %r15, 24(%r14) -; SSE41-NEXT: movq %rdx, 8(%r14) -; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pinsrd $1, %ebp, %xmm0 +; SSE41-NEXT: movq %rax, 16(%r14) +; SSE41-NEXT: movq %rdi, (%r14) +; SSE41-NEXT: movq %rdx, 24(%r14) +; SSE41-NEXT: movq %rsi, 8(%r14) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -2395,66 +2299,66 @@ ; AVX1-NEXT: pushq %r13 ; 
AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: movq %rdx, %r12 -; AVX1-NEXT: movq %rdi, %r11 +; AVX1-NEXT: movq %r9, %r10 +; AVX1-NEXT: movq %rcx, %r12 +; AVX1-NEXT: movq %rdx, %r11 +; AVX1-NEXT: movq %rsi, %rax ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; AVX1-NEXT: testq %r10, %r10 -; AVX1-NEXT: setne %dl -; AVX1-NEXT: testq %rcx, %rcx +; AVX1-NEXT: setne %cl +; AVX1-NEXT: testq %rsi, %rsi ; AVX1-NEXT: setne %r13b -; AVX1-NEXT: andb %dl, %r13b -; AVX1-NEXT: mulq %r15 -; AVX1-NEXT: movq %rax, %rdi +; AVX1-NEXT: andb %cl, %r13b +; AVX1-NEXT: mulq %r8 +; AVX1-NEXT: movq %rax, %rsi ; AVX1-NEXT: seto %bpl ; AVX1-NEXT: movq %r10, %rax -; AVX1-NEXT: mulq %r12 -; AVX1-NEXT: movq %rax, %rbx -; AVX1-NEXT: seto %cl -; AVX1-NEXT: orb %bpl, %cl -; AVX1-NEXT: addq %rdi, %rbx -; AVX1-NEXT: movq %r12, %rax -; AVX1-NEXT: mulq %r15 -; AVX1-NEXT: movq %rax, %r10 -; AVX1-NEXT: movq %rdx, %r15 -; AVX1-NEXT: addq %rbx, %r15 -; AVX1-NEXT: setb %al -; AVX1-NEXT: orb %cl, %al -; AVX1-NEXT: orb %r13b, %al -; AVX1-NEXT: movzbl %al, %ebp +; AVX1-NEXT: mulq %rdi +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: seto %bl +; AVX1-NEXT: orb %bpl, %bl +; AVX1-NEXT: addq %rsi, %rcx +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: mulq %r8 +; AVX1-NEXT: movq %rax, %rdi +; AVX1-NEXT: movq %rdx, %rsi +; AVX1-NEXT: addq %rcx, %rsi +; AVX1-NEXT: setb %cl +; AVX1-NEXT: orb %bl, %cl +; AVX1-NEXT: orb %r13b, %cl ; AVX1-NEXT: testq %r9, %r9 ; AVX1-NEXT: setne %al -; AVX1-NEXT: testq %rsi, %rsi -; AVX1-NEXT: setne %r13b -; AVX1-NEXT: andb %al, %r13b -; AVX1-NEXT: movq %rsi, %rax -; AVX1-NEXT: mulq %r8 -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: seto %r12b +; AVX1-NEXT: testq %r12, %r12 +; AVX1-NEXT: setne %r8b +; AVX1-NEXT: andb %al, %r8b +; AVX1-NEXT: movq %r12, %rax +; AVX1-NEXT: mulq %r15 +; AVX1-NEXT: movq %rax, %rbp +; AVX1-NEXT: seto %r10b ; AVX1-NEXT: movq %r9, %rax ; AVX1-NEXT: mulq %r11 -; AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: seto %cl -; AVX1-NEXT: orb %r12b, %cl -; AVX1-NEXT: addq %rsi, %rdi +; AVX1-NEXT: movq %rax, %rbx +; AVX1-NEXT: seto %r9b +; AVX1-NEXT: orb %r10b, %r9b +; AVX1-NEXT: addq %rbp, %rbx ; AVX1-NEXT: movq %r11, %rax -; AVX1-NEXT: mulq %r8 -; AVX1-NEXT: addq %rdi, %rdx +; AVX1-NEXT: mulq %r15 +; AVX1-NEXT: addq %rbx, %rdx ; AVX1-NEXT: setb %bl -; AVX1-NEXT: orb %cl, %bl -; AVX1-NEXT: orb %r13b, %bl -; AVX1-NEXT: movzbl %bl, %ecx +; AVX1-NEXT: orb %r9b, %bl +; AVX1-NEXT: orb %r8b, %bl +; AVX1-NEXT: movzbl %bl, %ebp +; AVX1-NEXT: negl %ebp +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: negl %ecx ; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 -; AVX1-NEXT: movq %r10, 16(%r14) -; AVX1-NEXT: movq %rax, (%r14) -; AVX1-NEXT: movq %r15, 24(%r14) -; AVX1-NEXT: movq %rdx, 8(%r14) -; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, 16(%r14) +; AVX1-NEXT: movq %rdi, (%r14) +; AVX1-NEXT: movq %rdx, 24(%r14) +; AVX1-NEXT: movq %rsi, 8(%r14) ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %r12 ; AVX1-NEXT: popq %r13 @@ -2471,66 +2375,66 @@ ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: movq %rdx, %r12 -; AVX2-NEXT: movq %rdi, %r11 +; AVX2-NEXT: movq %r9, %r10 +; AVX2-NEXT: movq %rcx, %r12 +; AVX2-NEXT: movq %rdx, %r11 +; AVX2-NEXT: movq 
%rsi, %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; AVX2-NEXT: testq %r10, %r10 -; AVX2-NEXT: setne %dl -; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: setne %cl +; AVX2-NEXT: testq %rsi, %rsi ; AVX2-NEXT: setne %r13b -; AVX2-NEXT: andb %dl, %r13b -; AVX2-NEXT: mulq %r15 -; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: andb %cl, %r13b +; AVX2-NEXT: mulq %r8 +; AVX2-NEXT: movq %rax, %rsi ; AVX2-NEXT: seto %bpl ; AVX2-NEXT: movq %r10, %rax -; AVX2-NEXT: mulq %r12 -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: seto %cl -; AVX2-NEXT: orb %bpl, %cl -; AVX2-NEXT: addq %rdi, %rbx -; AVX2-NEXT: movq %r12, %rax -; AVX2-NEXT: mulq %r15 -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq %rdx, %r15 -; AVX2-NEXT: addq %rbx, %r15 -; AVX2-NEXT: setb %al -; AVX2-NEXT: orb %cl, %al -; AVX2-NEXT: orb %r13b, %al -; AVX2-NEXT: movzbl %al, %ebp +; AVX2-NEXT: mulq %rdi +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: seto %bl +; AVX2-NEXT: orb %bpl, %bl +; AVX2-NEXT: addq %rsi, %rcx +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: mulq %r8 +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: movq %rdx, %rsi +; AVX2-NEXT: addq %rcx, %rsi +; AVX2-NEXT: setb %cl +; AVX2-NEXT: orb %bl, %cl +; AVX2-NEXT: orb %r13b, %cl ; AVX2-NEXT: testq %r9, %r9 ; AVX2-NEXT: setne %al -; AVX2-NEXT: testq %rsi, %rsi -; AVX2-NEXT: setne %r13b -; AVX2-NEXT: andb %al, %r13b -; AVX2-NEXT: movq %rsi, %rax -; AVX2-NEXT: mulq %r8 -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: seto %r12b +; AVX2-NEXT: testq %r12, %r12 +; AVX2-NEXT: setne %r8b +; AVX2-NEXT: andb %al, %r8b +; AVX2-NEXT: movq %r12, %rax +; AVX2-NEXT: mulq %r15 +; AVX2-NEXT: movq %rax, %rbp +; AVX2-NEXT: seto %r10b ; AVX2-NEXT: movq %r9, %rax ; AVX2-NEXT: mulq %r11 -; AVX2-NEXT: movq %rax, %rdi -; AVX2-NEXT: seto %cl -; AVX2-NEXT: orb %r12b, %cl -; AVX2-NEXT: addq %rsi, %rdi +; AVX2-NEXT: movq %rax, %rbx +; AVX2-NEXT: seto %r9b +; AVX2-NEXT: orb %r10b, %r9b +; AVX2-NEXT: addq %rbp, %rbx ; AVX2-NEXT: movq %r11, %rax -; AVX2-NEXT: mulq %r8 -; AVX2-NEXT: addq %rdi, %rdx +; AVX2-NEXT: mulq %r15 +; AVX2-NEXT: addq %rbx, %rdx ; AVX2-NEXT: setb %bl -; AVX2-NEXT: orb %cl, %bl -; AVX2-NEXT: orb %r13b, %bl -; AVX2-NEXT: movzbl %bl, %ecx +; AVX2-NEXT: orb %r9b, %bl +; AVX2-NEXT: orb %r8b, %bl +; AVX2-NEXT: movzbl %bl, %ebp +; AVX2-NEXT: negl %ebp +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: negl %ecx ; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: movq %r10, 16(%r14) -; AVX2-NEXT: movq %rax, (%r14) -; AVX2-NEXT: movq %r15, 24(%r14) -; AVX2-NEXT: movq %rdx, 8(%r14) -; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, 16(%r14) +; AVX2-NEXT: movq %rdi, (%r14) +; AVX2-NEXT: movq %rdx, 24(%r14) +; AVX2-NEXT: movq %rsi, 8(%r14) ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 ; AVX2-NEXT: popq %r13 @@ -2601,12 +2505,12 @@ ; AVX512-NEXT: andl $1, %esi ; AVX512-NEXT: kmovw %esi, %k1 ; AVX512-NEXT: korw %k0, %k1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: movq %r10, 16(%r14) ; AVX512-NEXT: movq %rax, (%r14) ; AVX512-NEXT: movq %r15, 24(%r14) ; AVX512-NEXT: movq %rdx, 8(%r14) -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 ; AVX512-NEXT: popq %r13 Index: 
llvm/test/CodeGen/X86/vec_usubo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_usubo.ll +++ llvm/test/CodeGen/X86/vec_usubo.ll @@ -47,91 +47,66 @@ define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { ; SSE2-LABEL: usubo_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: usubo_v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: psubq %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: usubo_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: psubq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movq %xmm2, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: usubo_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vmovq %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: usubo_v2i32: ; AVX2: # 
%bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vmovq %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: usubo_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpmovqd %xmm0, (%rdi) -; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 @@ -969,6 +944,7 @@ ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, (%rdi) ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -980,6 +956,7 @@ ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -990,6 +967,7 @@ ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: retq ; @@ -997,9 +975,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpnleuq %xmm0, %xmm1, %k1 -; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 @@ -1267,21 +1245,17 @@ ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %r11d +; SSE2-NEXT: sbbl %eax, %eax ; SSE2-NEXT: subq %r8, %rdi ; SSE2-NEXT: sbbq %r9, %rsi -; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: sbbl %eax, %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pinsrw $4, %r11d, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq %rdx, 16(%r10) ; SSE2-NEXT: movq %rdi, (%r10) ; SSE2-NEXT: movq %rcx, 24(%r10) ; SSE2-NEXT: movq %rsi, 8(%r10) -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[1,1,3,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: usubo_v2i128: @@ -1289,21 +1263,17 @@ ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %r11d +; SSSE3-NEXT: sbbl %eax, %eax ; SSSE3-NEXT: subq %r8, %rdi ; SSSE3-NEXT: sbbq %r9, %rsi -; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: sbbl %eax, %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pinsrw $4, %r11d, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movq %rdx, 16(%r10) ; SSSE3-NEXT: movq %rdi, (%r10) ; SSSE3-NEXT: movq %rcx, 24(%r10) ; SSSE3-NEXT: movq %rsi, 8(%r10) -; SSSE3-NEXT: psllq $63, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: usubo_v2i128: @@ -1311,21 +1281,16 @@ ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %r11d +; SSE41-NEXT: sbbl %r11d, %r11d ; SSE41-NEXT: subq %r8, %rdi ; SSE41-NEXT: sbbq %r9, %rsi -; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: sbbl %eax, %eax ; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pinsrb $8, %r11d, %xmm0 +; SSE41-NEXT: pinsrd $1, %r11d, %xmm0 ; SSE41-NEXT: movq %rdx, 16(%r10) ; SSE41-NEXT: movq %rdi, (%r10) ; SSE41-NEXT: movq %rcx, 24(%r10) ; SSE41-NEXT: movq %rsi, 8(%r10) -; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: retq ; ; AVX1-LABEL: usubo_v2i128: @@ -1333,21 +1298,16 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; AVX1-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; AVX1-NEXT: setb %al -; AVX1-NEXT: movzbl %al, %r11d +; AVX1-NEXT: sbbl %r11d, %r11d ; AVX1-NEXT: subq %r8, %rdi ; AVX1-NEXT: sbbq %r9, %rsi -; AVX1-NEXT: setb %al -; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: sbbl %eax, %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0 ; AVX1-NEXT: movq %rdx, 16(%r10) ; AVX1-NEXT: movq %rdi, (%r10) ; AVX1-NEXT: movq %rcx, 24(%r10) ; AVX1-NEXT: movq %rsi, 8(%r10) -; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: usubo_v2i128: @@ -1355,21 +1315,16 @@ ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: setb %al -; AVX2-NEXT: movzbl %al, %r11d +; AVX2-NEXT: sbbl %r11d, %r11d ; AVX2-NEXT: subq %r8, %rdi ; AVX2-NEXT: sbbq %r9, %rsi -; AVX2-NEXT: setb %al -; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: sbbl %eax, %eax ; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0 ; AVX2-NEXT: movq %rdx, 16(%r10) ; AVX2-NEXT: movq %rdi, (%r10) ; AVX2-NEXT: movq %rcx, 24(%r10) ; AVX2-NEXT: movq %rsi, 8(%r10) -; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: usubo_v2i128: @@ -1386,12 +1341,12 @@ ; AVX512-NEXT: andl $1, %eax ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: korw %k0, %k1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: movq %rdx, 16(%r10) ; 
AVX512-NEXT: movq %rdi, (%r10) ; AVX512-NEXT: movq %rcx, 24(%r10) ; AVX512-NEXT: movq %rsi, 8(%r10) -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq %t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 Index: llvm/test/CodeGen/X86/vector-blend.ll =================================================================== --- llvm/test/CodeGen/X86/vector-blend.ll +++ llvm/test/CodeGen/X86/vector-blend.ll @@ -64,24 +64,30 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { ; SSE2-LABEL: vsel_4xi8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_4xi8: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_4xi8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: vsel_4xi8: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq entry: %vsel = select <4 x i1> , <4 x i8> %v1, <4 x i8> %v2 @@ -91,26 +97,28 @@ define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) { ; SSE2-LABEL: vsel_4xi16: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_4xi16: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSSE3-NEXT: andps %xmm2, %xmm0 +; SSSE3-NEXT: andnps %xmm1, %xmm2 +; SSSE3-NEXT: orps %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_4xi16: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: vsel_4xi16: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX-NEXT: retq entry: %vsel = select <4 x i1> , <4 x i16> %v1, <4 x i16> %v2 Index: llvm/test/CodeGen/X86/vector-ext-logic.ll 
=================================================================== --- llvm/test/CodeGen/X86/vector-ext-logic.ll +++ llvm/test/CodeGen/X86/vector-ext-logic.ll @@ -140,14 +140,17 @@ define <8 x i16> @zext_and_v8i16(<8 x i8> %x, <8 x i8> %y) { ; SSE2-LABEL: zext_and_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: andps %xmm1, %xmm0 -; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX2-LABEL: zext_and_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vandps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %xz = zext <8 x i8> %x to <8 x i16> %yz = zext <8 x i8> %y to <8 x i16> @@ -158,14 +161,17 @@ define <8 x i16> @zext_or_v8i16(<8 x i8> %x, <8 x i8> %y) { ; SSE2-LABEL: zext_or_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: orps %xmm1, %xmm0 -; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX2-LABEL: zext_or_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %xz = zext <8 x i8> %x to <8 x i16> %yz = zext <8 x i8> %y to <8 x i16> @@ -176,14 +182,17 @@ define <8 x i16> @zext_xor_v8i16(<8 x i8> %x, <8 x i8> %y) { ; SSE2-LABEL: zext_xor_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm1, %xmm0 -; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX2-LABEL: zext_xor_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %xz = zext <8 x i8> %x to <8 x 
i16> %yz = zext <8 x i8> %y to <8 x i16> @@ -194,19 +203,17 @@ define <8 x i16> @sext_and_v8i16(<8 x i8> %x, <8 x i8> %y) { ; SSE2-LABEL: sext_and_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: psllw $8, %xmm1 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX2-LABEL: sext_and_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %xs = sext <8 x i8> %x to <8 x i16> @@ -218,19 +225,17 @@ define <8 x i16> @sext_or_v8i16(<8 x i8> %x, <8 x i8> %y) { ; SSE2-LABEL: sext_or_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: psllw $8, %xmm1 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX2-LABEL: sext_or_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %xs = sext <8 x i8> %x to <8 x i16> @@ -242,19 +247,17 @@ define <8 x i16> @sext_xor_v8i16(<8 x i8> %x, <8 x i8> %y) { ; SSE2-LABEL: sext_xor_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: psllw $8, %xmm1 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX2-LABEL: sext_xor_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %xs = sext <8 x i8> %x to <8 x i16> @@ -295,18 +298,13 @@ define <8 x i32> @bool_zext_or(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_zext_or: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = 
xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; @@ -325,18 +323,13 @@ define <8 x i32> @bool_zext_xor(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_zext_xor: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-gep.ll =================================================================== --- llvm/test/CodeGen/X86/vector-gep.ll +++ llvm/test/CodeGen/X86/vector-gep.ll @@ -74,8 +74,7 @@ define <4 x i8*> @AGEP5(<4 x i8*> %param, <4 x i8> %off) nounwind { ; CHECK-LABEL: AGEP5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpslld $24, %xmm1, %xmm1 -; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1 +; CHECK-NEXT: vpmovsxbd %xmm1, %xmm1 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retl %A = getelementptr i8, <4 x i8*> %param, <4 x i8> %off Index: llvm/test/CodeGen/X86/vector-half-conversions.ll =================================================================== --- llvm/test/CodeGen/X86/vector-half-conversions.ll +++ llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -24,7 +24,6 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind { ; ALL-LABEL: cvt_4i16_to_4f32: ; ALL: # %bb.0: -; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; ALL-NEXT: vmovq %xmm0, %rax ; ALL-NEXT: movq %rax, %rcx ; ALL-NEXT: movq %rax, %rdx @@ -932,88 +931,20 @@ } define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind { -; AVX1-LABEL: cvt_2i16_to_2f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vmovd %xmm0, %eax 
-; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vmovd %ecx, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: cvt_2i16_to_2f64: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovd %xmm0, %eax -; AVX2-SLOW-NEXT: movswl %ax, %ecx -; AVX2-SLOW-NEXT: shrl $16, %eax -; AVX2-SLOW-NEXT: cwtl -; AVX2-SLOW-NEXT: vmovd %eax, %xmm0 -; AVX2-SLOW-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovd %ecx, %xmm1 -; AVX2-SLOW-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: cvt_2i16_to_2f64: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vmovd %xmm0, %eax -; AVX2-FAST-NEXT: movswl %ax, %ecx -; AVX2-FAST-NEXT: shrl $16, %eax -; AVX2-FAST-NEXT: cwtl -; AVX2-FAST-NEXT: vmovd %eax, %xmm0 -; AVX2-FAST-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovd %ecx, %xmm1 -; AVX2-FAST-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-FAST-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: cvt_2i16_to_2f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: movswl %ax, %ecx -; AVX512F-NEXT: shrl $16, %eax -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %ecx, %xmm1 -; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_2i16_to_2f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; AVX512VL-NEXT: movswl %ax, %ecx -; AVX512VL-NEXT: shrl $16, %eax -; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %ecx, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: retq +; ALL-LABEL: cvt_2i16_to_2f64: +; ALL: # %bb.0: +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movswl %ax, %ecx +; ALL-NEXT: shrl $16, %eax +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: vmovd %ecx, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: retq %1 = bitcast <2 x i16> %a0 to <2 x half> %2 = fpext <2 x half> %1 to <2 x double> ret <2 x double> %2 @@ -1022,7 +953,6 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind { ; ALL-LABEL: cvt_4i16_to_4f64: ; ALL: # %bb.0: -; 
ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; ALL-NEXT: vmovq %xmm0, %rax ; ALL-NEXT: movq %rax, %rcx ; ALL-NEXT: movl %eax, %edx @@ -1055,67 +985,20 @@ } define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind { -; AVX1-LABEL: cvt_8i16_to_2f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vmovd %ecx, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_8i16_to_2f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: movswl %ax, %ecx -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vmovd %ecx, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: retq -; -; AVX512F-LABEL: cvt_8i16_to_2f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: movswl %ax, %ecx -; AVX512F-NEXT: shrl $16, %eax -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %ecx, %xmm1 -; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_8i16_to_2f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512VL-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; AVX512VL-NEXT: movswl %ax, %ecx -; AVX512VL-NEXT: shrl $16, %eax -; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %ecx, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: retq +; ALL-LABEL: cvt_8i16_to_2f64: +; ALL: # %bb.0: +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movswl %ax, %ecx +; ALL-NEXT: shrl $16, %eax +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: vmovd %ecx, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> %2 = bitcast <2 x i16> %1 to <2 x half> %3 = fpext <2 x half> %2 to <2 x double> @@ -1594,7 +1477,7 @@ ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; ALL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -1619,7 +1502,7 @@ ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; ALL-NEXT: 
retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -1972,7 +1855,7 @@ ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; ALL-NEXT: vmovaps %xmm0, (%rdi) ; ALL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> @@ -2279,12 +2162,12 @@ ; ALL-NEXT: subq $40, %rsp ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; ALL-NEXT: vmovaps (%rsp), %xmm0 ; ALL-NEXT: addq $40, %rsp ; ALL-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> @@ -2306,7 +2189,7 @@ ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 @@ -2315,7 +2198,7 @@ ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; ALL-NEXT: vmovaps (%rsp), %xmm0 ; ALL-NEXT: addq $88, %rsp ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -2337,7 +2220,7 @@ ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 @@ -2346,7 +2229,7 @@ ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovaps (%rsp), %xmm0 ; ALL-NEXT: addq $88, %rsp ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -2369,7 +2252,7 @@ ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 @@ -2775,7 +2658,7 @@ ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 @@ -2784,7 +2667,7 @@ ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovaps (%rsp), %xmm0 ; ALL-NEXT: vmovaps %xmm0, (%rbx) ; ALL-NEXT: addq $80, %rsp ; ALL-NEXT: popq %rbx @@ -2812,7 +2695,7 @@ ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 Index: llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -349,16 +349,18 @@ ; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 @@ -859,16 +861,18 @@ ; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4 -; 
AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 @@ -881,14 +885,16 @@ ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq Index: llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -248,16 +248,18 @@ ; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7] ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpsllw $7, %xmm6, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4,5,6,7] +; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 +; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 +; 
AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 @@ -279,16 +281,18 @@ ; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpsllw $7, %xmm6, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6],xmm6[7] +; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 +; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 @@ -679,16 +683,18 @@ ; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3,4,5,6,7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpsllw $7, %xmm6, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4,5,6,7] +; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 +; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm5[0],xmm4[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm5, %xmm5 @@ -697,35 +703,39 @@ ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm5 +; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpsllw $7, %xmm7, %xmm7 +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7] +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 +; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpunpcklbw 
{{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpsllw $7, %xmm6, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6],xmm6[7] -; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm5, %xmm5 -; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm6, %xmm6 @@ -738,14 +748,16 @@ ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq Index: llvm/test/CodeGen/X86/vector-idiv-v2i32.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-v2i32.ll +++ llvm/test/CodeGen/X86/vector-idiv-v2i32.ll @@ -8,58 +8,40 @@ ; X64-LABEL: test_udiv7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm1, %eax -; X64-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; X64-NEXT: shrq $32, %rcx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: addl %ecx, %eax -; X64-NEXT: shrl $2, %eax -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: subl %edx, %ecx -; X64-NEXT: shrl %ecx -; X64-NEXT: addl %edx, %ecx -; X64-NEXT: shrl $2, %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movd %eax, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmuludq %xmm1, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64-NEXT: pmuludq %xmm1, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: psubd %xmm2, %xmm0 +; X64-NEXT: psrld $1, %xmm0 +; X64-NEXT: paddd %xmm2, %xmm0 +; X64-NEXT: psrld $2, 
%xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_udiv7_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X86-NEXT: movd %xmm0, %esi -; X86-NEXT: movl $613566757, %ebx # imm = 0x24924925 -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: subl %edx, %esi -; X86-NEXT: shrl %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: shrl $2, %esi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: subl %edx, %ecx -; X86-NEXT: shrl %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: shrl $2, %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: movd %esi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%edi) -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86-NEXT: pmuludq %xmm1, %xmm3 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-NEXT: psubd %xmm2, %xmm0 +; X86-NEXT: psrld $1, %xmm0 +; X86-NEXT: paddd %xmm2, %xmm0 +; X86-NEXT: psrld $2, %xmm0 +; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_udiv7_v2i32: @@ -110,76 +92,50 @@ ; X64-LABEL: test_urem7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: subl %edx, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: addl %edx, %eax -; X64-NEXT: shrl $2, %eax -; X64-NEXT: leal (,%rax,8), %edx -; X64-NEXT: subl %edx, %eax -; X64-NEXT: addl %ecx, %eax -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: movl %ecx, %edi -; X64-NEXT: subl %edx, %edi -; X64-NEXT: shrl %edi -; X64-NEXT: addl %edx, %edi -; X64-NEXT: shrl $2, %edi -; X64-NEXT: leal (,%rdi,8), %edx -; X64-NEXT: subl %edx, %edi -; X64-NEXT: addl %ecx, %edi -; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: movd %eax, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmuludq %xmm1, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64-NEXT: pmuludq %xmm1, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psubd %xmm2, %xmm1 +; X64-NEXT: psrld $1, %xmm1 +; X64-NEXT: paddd %xmm2, %xmm1 +; X64-NEXT: psrld $2, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: pslld $3, %xmm2 +; X64-NEXT: psubd %xmm2, %xmm1 +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: movq %xmm1, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_urem7_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl 
%ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X86-NEXT: movd %xmm0, %esi -; X86-NEXT: movl $613566757, %edi # imm = 0x24924925 -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: subl %edx, %ebx -; X86-NEXT: shrl %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: shrl $2, %ebx -; X86-NEXT: leal (,%ebx,8), %eax -; X86-NEXT: subl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: shrl %eax -; X86-NEXT: addl %edx, %eax -; X86-NEXT: shrl $2, %eax -; X86-NEXT: leal (,%eax,8), %edx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movd %eax, %xmm0 -; X86-NEXT: movd %ebx, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%ebp) -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86-NEXT: pmuludq %xmm1, %xmm3 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psubd %xmm2, %xmm1 +; X86-NEXT: psrld $1, %xmm1 +; X86-NEXT: paddd %xmm2, %xmm1 +; X86-NEXT: psrld $2, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: pslld $3, %xmm2 +; X86-NEXT: psubd %xmm2, %xmm1 +; X86-NEXT: paddd %xmm0, %xmm1 +; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_urem7_v2i32: @@ -240,67 +196,52 @@ ; X64-LABEL: test_sdiv7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm1, %eax -; X64-NEXT: cltq -; X64-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; X64-NEXT: shrq $32, %rcx -; X64-NEXT: addl %ecx, %eax -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shrl $31, %ecx -; X64-NEXT: sarl $2, %eax -; X64-NEXT: addl %ecx, %eax -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: movslq %ecx, %rcx -; X64-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: addl %edx, %ecx -; X64-NEXT: movl %ecx, %edx -; X64-NEXT: shrl $31, %edx -; X64-NEXT: sarl $2, %ecx -; X64-NEXT: addl %edx, %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movd %eax, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmuludq %xmm1, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64-NEXT: pmuludq %xmm1, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pcmpgtd %xmm0, %xmm3 +; X64-NEXT: pand %xmm1, %xmm3 +; X64-NEXT: paddd %xmm0, %xmm3 +; X64-NEXT: psubd %xmm3, %xmm2 +; X64-NEXT: paddd %xmm0, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: 
psrld $31, %xmm0 +; X64-NEXT: psrad $2, %xmm2 +; X64-NEXT: paddd %xmm0, %xmm2 +; X64-NEXT: movq %xmm2, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_sdiv7_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X86-NEXT: movd %xmm0, %esi -; X86-NEXT: movl $-1840700269, %ebp # imm = 0x92492493 -; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: sarl $2, %edi -; X86-NEXT: addl %eax, %edi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: imull %ebp -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: sarl $2, %edx -; X86-NEXT: addl %eax, %edx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movd %edi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%ebx) -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86-NEXT: pmuludq %xmm1, %xmm3 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: pxor %xmm3, %xmm3 +; X86-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-NEXT: pand %xmm1, %xmm3 +; X86-NEXT: paddd %xmm0, %xmm3 +; X86-NEXT: psubd %xmm3, %xmm2 +; X86-NEXT: paddd %xmm0, %xmm2 +; X86-NEXT: movdqa %xmm2, %xmm0 +; X86-NEXT: psrld $31, %xmm0 +; X86-NEXT: psrad $2, %xmm2 +; X86-NEXT: paddd %xmm0, %xmm2 +; X86-NEXT: movq %xmm2, (%eax) ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_sdiv7_v2i32: @@ -363,79 +304,60 @@ ; X64-LABEL: test_srem7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm1, %eax -; X64-NEXT: movslq %eax, %rcx -; X64-NEXT: imulq $-1840700269, %rcx, %rax # imm = 0x92492493 -; X64-NEXT: shrq $32, %rax -; X64-NEXT: addl %ecx, %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shrl $31, %edx -; X64-NEXT: sarl $2, %eax -; X64-NEXT: addl %edx, %eax -; X64-NEXT: leal (,%rax,8), %edx -; X64-NEXT: subl %edx, %eax -; X64-NEXT: addl %ecx, %eax -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: movslq %ecx, %rcx -; X64-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: addl %ecx, %edx -; X64-NEXT: movl %edx, %edi -; X64-NEXT: shrl $31, %edi -; X64-NEXT: sarl $2, %edx -; X64-NEXT: addl %edi, %edx -; X64-NEXT: leal (,%rdx,8), %edi -; X64-NEXT: subl %edi, %edx -; X64-NEXT: addl %ecx, %edx -; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: movd %eax, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pmuludq %xmm1, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64-NEXT: pmuludq %xmm1, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[1,3,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pcmpgtd %xmm0, %xmm3 +; X64-NEXT: pand %xmm1, %xmm3 +; X64-NEXT: paddd %xmm0, %xmm3 +; X64-NEXT: psubd %xmm3, %xmm2 +; X64-NEXT: paddd %xmm0, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm1 +; X64-NEXT: psrld $31, %xmm1 +; X64-NEXT: psrad $2, %xmm2 +; X64-NEXT: paddd %xmm1, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm1 +; X64-NEXT: pslld $3, %xmm1 +; X64-NEXT: psubd %xmm1, %xmm2 +; X64-NEXT: paddd %xmm0, %xmm2 +; X64-NEXT: movq %xmm2, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_srem7_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] -; X86-NEXT: movd %xmm0, %esi -; X86-NEXT: movl $-1840700269, %ebx # imm = 0x92492493 -; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: sarl $2, %edi -; X86-NEXT: addl %eax, %edi -; X86-NEXT: leal (,%edi,8), %eax -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: imull %ebx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: sarl $2, %edx -; X86-NEXT: addl %eax, %edx -; X86-NEXT: leal (,%edx,8), %eax -; X86-NEXT: subl %eax, %edx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movd %edi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%ebp) -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86-NEXT: pmuludq %xmm1, %xmm3 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: pxor %xmm3, %xmm3 +; X86-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-NEXT: pand %xmm1, %xmm3 +; X86-NEXT: paddd %xmm0, %xmm3 +; X86-NEXT: psubd %xmm3, %xmm2 +; X86-NEXT: paddd %xmm0, %xmm2 +; X86-NEXT: movdqa %xmm2, %xmm1 +; X86-NEXT: psrld $31, %xmm1 +; X86-NEXT: psrad $2, %xmm2 +; X86-NEXT: paddd %xmm1, %xmm2 +; X86-NEXT: movdqa %xmm2, %xmm1 +; X86-NEXT: pslld $3, %xmm1 +; X86-NEXT: psubd %xmm1, %xmm2 +; X86-NEXT: paddd %xmm0, %xmm2 +; X86-NEXT: movq %xmm2, (%eax) ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_srem7_v2i32: @@ -506,10 +428,7 @@ ; X64-LABEL: test_udiv_pow2_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: psrlq $3, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: psrld $3, %xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq ; @@ -518,10 +437,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: psrlq $3, %xmm0 -; X86-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: psrld $3, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; @@ -549,14 +465,9 @@ define void @test_urem_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X64-LABEL: test_urem_pow2_v2i32: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: movl 4(%rdi), %ecx -; X64-NEXT: movq %rcx, %xmm0 -; X64-NEXT: movq %rax, %xmm1 -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-NEXT: pand {{.*}}(%rip), %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: andps {{.*}}(%rip), %xmm0 +; X64-NEXT: movlps %xmm0, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_urem_pow2_v2i32: @@ -564,10 +475,8 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X86-NEXT: andps {{\.LCPI.*}}, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: movlps %xmm0, (%eax) ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_urem_pow2_v2i32: @@ -595,23 +504,12 @@ ; X64-LABEL: test_sdiv_pow2_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,1] +; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: psrad $31, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: psrlq $31, %xmm0 -; X64-NEXT: pand {{.*}}(%rip), %xmm0 -; X64-NEXT: psrlq $29, %xmm0 -; X64-NEXT: paddq %xmm2, %xmm0 -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; X64-NEXT: psrad $31, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: psrlq $3, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rsi) +; X64-NEXT: psrld $29, %xmm1 +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: psrad $3, %xmm1 +; X64-NEXT: movq %xmm1, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_sdiv_pow2_v2i32: @@ -619,28 +517,12 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrad $31, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: psrlq $31, %xmm2 -; X86-NEXT: movsd {{.*#+}} xmm2 = xmm2[0,1] -; X86-NEXT: movapd {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314] -; X86-NEXT: xorpd %xmm1, %xmm2 -; X86-NEXT: psubq %xmm1, %xmm2 -; X86-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X86-NEXT: psrlq $29, %xmm2 -; X86-NEXT: paddq %xmm0, %xmm2 -; X86-NEXT: psllq $32, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] -; X86-NEXT: psrad $31, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: psrlq $3, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: psrld $29, %xmm1 +; X86-NEXT: paddd %xmm0, %xmm1 +; X86-NEXT: psrad $3, %xmm1 +; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_sdiv_pow2_v2i32: @@ -676,10 +558,7 @@ ; 
X64-LABEL: test_srem_pow2_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: psrlq $3, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: psrld $3, %xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq ; @@ -688,10 +567,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: psrlq $3, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: psrld $3, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; @@ -722,52 +598,45 @@ ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1] -; X64-NEXT: movd %xmm2, %esi +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %esi ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %esi -; X64-NEXT: movl %eax, %esi +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm0, %esi ; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %edi +; X64-NEXT: divl %esi ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rcx) +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movq %xmm2, (%rcx) ; X64-NEXT: retq ; ; X86-LABEL: test_udiv_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] ; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: movd %xmm1, %ebx -; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1] ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ebx +; X86-NEXT: divl %esi ; X86-NEXT: movd %eax, %xmm0 -; X86-NEXT: movd %esi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%edi) +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86-NEXT: movq %xmm2, (%ecx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_udiv_v2i32: @@ -828,52 +697,45 @@ ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1] -; X64-NEXT: movd %xmm2, %esi +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %esi ; X64-NEXT: xorl %edx, 
%edx ; X64-NEXT: divl %esi -; X64-NEXT: movl %edx, %esi +; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm0, %esi ; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %edi +; X64-NEXT: divl %esi ; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rcx) +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movq %xmm2, (%rcx) ; X64-NEXT: retq ; ; X86-LABEL: test_urem_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] ; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: movd %xmm1, %ebx -; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1] ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movd %edx, %xmm2 +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ebx +; X86-NEXT: divl %esi ; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movd %esi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%edi) +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86-NEXT: movq %xmm2, (%ecx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64_WIDEN-LABEL: test_urem_v2i32: @@ -934,21 +796,20 @@ ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1] -; X64-NEXT: movd %xmm2, %esi +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %esi ; X64-NEXT: cltd ; X64-NEXT: idivl %esi -; X64-NEXT: movl %eax, %esi +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm0, %esi ; X64-NEXT: cltd -; X64-NEXT: idivl %edi +; X64-NEXT: idivl %esi ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rcx) +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movq %xmm2, (%rcx) ; X64-NEXT: retq ; ; X86-LABEL: test_sdiv_v2i32: @@ -956,27 +817,26 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-NEXT: movd %xmm0, 
%eax +; X86-NEXT: movd %xmm1, %edi +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] ; X86-NEXT: movd %xmm1, %ebx -; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1] -; X86-NEXT: movd %xmm1, %esi -; X86-NEXT: cltd -; X86-NEXT: idivl %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd ; X86-NEXT: idivl %ebx ; X86-NEXT: movd %eax, %xmm0 -; X86-NEXT: movd %esi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%edi) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cltd +; X86-NEXT: idivl %edi +; X86-NEXT: movd %eax, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1045,21 +905,20 @@ ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1] -; X64-NEXT: movd %xmm2, %esi +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %esi ; X64-NEXT: cltd ; X64-NEXT: idivl %esi -; X64-NEXT: movl %eax, %esi +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm0, %esi ; X64-NEXT: cltd -; X64-NEXT: idivl %edi +; X64-NEXT: idivl %esi ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movq %xmm0, (%rcx) +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movq %xmm2, (%rcx) ; X64-NEXT: retq ; ; X86-LABEL: test_srem_v2i32: @@ -1067,27 +926,26 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: movd %xmm1, %edi +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] ; X86-NEXT: movd %xmm1, %ebx -; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1] -; X86-NEXT: movd %xmm1, %esi -; X86-NEXT: cltd -; X86-NEXT: idivl %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd ; X86-NEXT: idivl %ebx ; X86-NEXT: movd %eax, %xmm0 -; X86-NEXT: movd %esi, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: movq %xmm0, (%edi) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cltd +; X86-NEXT: idivl %edi +; X86-NEXT: movd %eax, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/vector-narrow-binop.ll =================================================================== --- llvm/test/CodeGen/X86/vector-narrow-binop.ll +++ llvm/test/CodeGen/X86/vector-narrow-binop.ll @@ -107,34 +107,20 @@ ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: PR39893: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: PR39893: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-NEXT: retq -; -; AVX512-LABEL: PR39893: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512-NEXT: retq +; AVX-LABEL: PR39893: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: retq %sub = sub <2 x i32> , %x %bc = bitcast <2 x i32> %sub to <8 x i8> %shuffle = shufflevector <8 x i8> %y, <8 x i8> %bc, <2 x i32> @@ -146,16 +132,13 @@ ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: subps %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: PR39893_2: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %fsub = fsub <2 x float> zeroinitializer, %x %bc = bitcast <2 x float> %fsub to <8 x i8> Index: llvm/test/CodeGen/X86/vector-reduce-add.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-add.ll +++ llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -198,22 +198,35 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE-LABEL: test_v2i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: paddq %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: test_v2i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; 
AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v2i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0) @@ -499,24 +512,40 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE-LABEL: test_v2i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: paddq %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: test_v2i16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v2i16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -527,49 +556,50 @@ define i16 @test_v4i16(<4 x i16> %a0) { ; SSE-LABEL: test_v4i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v4i16: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: test_v4i16: ; AVX1-FAST: 
# %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -920,32 +950,34 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -956,59 +988,44 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; 
SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: pextrb $0, %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; -; AVX1-SLOW-LABEL: test_v4i8: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $al killed $al killed $eax -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v4i8: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $al killed $al killed $eax -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1019,73 +1036,52 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-LABEL: test_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; 
-; AVX1-SLOW-LABEL: test_v8i8: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $al killed $al killed $eax -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v8i8: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $al killed $al killed $eax -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vector-reduce-and-bool.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -118,17 +118,29 @@ } define i1 @trunc_v8i16_v8i1(<8 x i8>) { -; SSE-LABEL: trunc_v8i16_v8i1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $15, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpb $-1, %al -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: trunc_v8i16_v8i1: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpb $-1, %al +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: trunc_v8i16_v8i1: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX-LABEL: trunc_v8i16_v8i1: ; 
AVX: # %bb.0: +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmovmskb %xmm0, %eax @@ -138,9 +150,9 @@ ; ; AVX512F-LABEL: trunc_v8i16_v8i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: cmpb $-1, %al ; AVX512F-NEXT: sete %al @@ -149,8 +161,8 @@ ; ; AVX512BW-LABEL: trunc_v8i16_v8i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: cmpb $-1, %al ; AVX512BW-NEXT: sete %al @@ -159,8 +171,8 @@ ; ; AVX512VL-LABEL: trunc_v8i16_v8i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovw2m %xmm0, %k0 +; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovb2m %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: cmpb $-1, %al ; AVX512VL-NEXT: sete %al @@ -1058,22 +1070,33 @@ } define i1 @icmp_v8i16_v8i1(<8 x i8>) { -; SSE-LABEL: icmp_v8i16_v8i1: -; SSE: # %bb.0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE-NEXT: packsswb %xmm0, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax -; SSE-NEXT: cmpb $-1, %al -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: icmp_v8i16_v8i1: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpb $-1, %al +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: icmp_v8i16_v8i1: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE41-NEXT: pmovsxbw %xmm1, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq ; ; AVX-LABEL: icmp_v8i16_v8i1: ; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmovmskb %xmm0, %eax ; AVX-NEXT: cmpb $-1, %al @@ -1082,11 +1105,10 @@ ; ; AVX512F-LABEL: icmp_v8i16_v8i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: cmpb $-1, %al ; AVX512F-NEXT: sete %al @@ -1096,8 +1118,7 @@ ; AVX512BW-LABEL: icmp_v8i16_v8i1: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512BW-NEXT: 
vptestnmw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: cmpb $-1, %al ; AVX512BW-NEXT: sete %al @@ -1106,7 +1127,7 @@ ; ; AVX512VL-LABEL: icmp_v8i16_v8i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vptestnmw {{.*}}(%rip), %xmm0, %k0 +; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: cmpb $-1, %al ; AVX512VL-NEXT: sete %al Index: llvm/test/CodeGen/X86/vector-reduce-and.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-and.ll +++ llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -182,14 +182,14 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE-LABEL: test_v2i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq @@ -397,7 +397,8 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE-LABEL: test_v2i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -405,7 +406,7 @@ ; ; AVX-LABEL: test_v2i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax @@ -417,9 +418,10 @@ define i16 @test_v4i16(<4 x i16> %a0) { ; SSE-LABEL: test_v4i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -427,10 +429,10 @@ ; ; AVX-LABEL: test_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -684,7 +686,8 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -692,7 +695,8 @@ ; ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -700,7 +704,7 @@ ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax @@ -712,9 +716,11 @@ 
define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -722,9 +728,11 @@ ; ; SSE41-LABEL: test_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 ; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: pextrb $0, %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -732,9 +740,9 @@ ; ; AVX-LABEL: test_v4i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax @@ -746,12 +754,13 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-LABEL: test_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -759,12 +768,13 @@ ; ; SSE41-LABEL: test_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -772,12 +782,12 @@ ; ; AVX-LABEL: test_v8i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq Index: llvm/test/CodeGen/X86/vector-reduce-mul.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -776,24 +776,31 @@ ; define i32 @test_v2i32(<2 x i32> %a0) { -; SSE-LABEL: test_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pmuludq %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq +; SSE2-LABEL: test_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 
+; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmulld %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %a0) @@ -1119,24 +1126,25 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE-LABEL: test_v2i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pmullw %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -1145,48 +1153,33 @@ } define i16 @test_v4i16(<4 x i16> %a0) { -; SSE2-LABEL: test_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pextrw $0, %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: 
vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -1563,32 +1556,34 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmuludq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1599,46 +1594,56 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax 
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmullw %xmm1, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmullw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1649,12 +1654,16 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-LABEL: test_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,2,3,0] ; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -1662,12 +1671,19 @@ ; ; SSE41-LABEL: test_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE41-NEXT: pmullw %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmullw %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -1675,11 +1691,18 @@ ; ; AVX-LABEL: test_v8i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX-NEXT: vpmullw %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax @@ -1687,11 +1710,18 @@ ; ; AVX512-LABEL: test_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmullw %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax @@ -1789,18 +1819,24 @@ ; ; AVX2-LABEL: test_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; 
AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -1989,12 +2025,14 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -2023,25 +2061,32 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax @@ -2111,25 +2156,32 @@ ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpmullw 
%xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax @@ -2276,17 +2328,23 @@ ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: 
vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmullw %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -2325,25 +2383,32 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax @@ -2368,25 +2433,32 @@ ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax @@ -2453,25 +2525,32 @@ ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; 
AVX512DQ-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax @@ -2668,8 +2747,8 @@ ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm8 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm9 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = 
xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 @@ -2681,30 +2760,40 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] ; AVX1-NEXT: vpmullw %xmm10, %xmm6, %xmm6 -; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm9 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX1-NEXT: vpmullw %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpmullw %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm4 -; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpmullw %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -2751,25 +2840,32 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax @@ -2801,25 +2897,32 @@ ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax @@ -2901,25 +3004,32 @@ ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax Index: llvm/test/CodeGen/X86/vector-reduce-or-bool.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -112,17 +112,29 @@ } define i1 @trunc_v8i16_v8i1(<8 x i8>) { -; SSE-LABEL: trunc_v8i16_v8i1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $15, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb 
%xmm0, %eax -; SSE-NEXT: testb %al, %al -; SSE-NEXT: setne %al -; SSE-NEXT: retq +; SSE2-LABEL: trunc_v8i16_v8i1: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: setne %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: trunc_v8i16_v8i1: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testb %al, %al +; SSE41-NEXT: setne %al +; SSE41-NEXT: retq ; ; AVX-LABEL: trunc_v8i16_v8i1: ; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmovmskb %xmm0, %eax @@ -132,9 +144,9 @@ ; ; AVX512F-LABEL: trunc_v8i16_v8i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: setne %al @@ -143,8 +155,8 @@ ; ; AVX512BW-LABEL: trunc_v8i16_v8i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al ; AVX512BW-NEXT: setne %al @@ -153,8 +165,8 @@ ; ; AVX512VL-LABEL: trunc_v8i16_v8i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovw2m %xmm0, %k0 +; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovb2m %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al @@ -1043,22 +1055,33 @@ } define i1 @icmp_v8i16_v8i1(<8 x i8>) { -; SSE-LABEL: icmp_v8i16_v8i1: -; SSE: # %bb.0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE-NEXT: packsswb %xmm0, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax -; SSE-NEXT: testb %al, %al -; SSE-NEXT: setne %al -; SSE-NEXT: retq +; SSE2-LABEL: icmp_v8i16_v8i1: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: setne %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: icmp_v8i16_v8i1: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE41-NEXT: pmovsxbw %xmm1, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testb %al, %al +; SSE41-NEXT: setne %al +; SSE41-NEXT: retq ; ; AVX-LABEL: icmp_v8i16_v8i1: ; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: 
vpmovmskb %xmm0, %eax ; AVX-NEXT: testb %al, %al @@ -1067,11 +1090,10 @@ ; ; AVX512F-LABEL: icmp_v8i16_v8i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: setne %al @@ -1081,8 +1103,7 @@ ; AVX512BW-LABEL: icmp_v8i16_v8i1: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al ; AVX512BW-NEXT: setne %al @@ -1091,7 +1112,7 @@ ; ; AVX512VL-LABEL: icmp_v8i16_v8i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vptestnmw {{.*}}(%rip), %xmm0, %k0 +; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al Index: llvm/test/CodeGen/X86/vector-reduce-or.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-or.ll +++ llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -182,14 +182,14 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE-LABEL: test_v2i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq @@ -397,7 +397,8 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE-LABEL: test_v2i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -405,7 +406,7 @@ ; ; AVX-LABEL: test_v2i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax @@ -417,9 +418,10 @@ define i16 @test_v4i16(<4 x i16> %a0) { ; SSE-LABEL: test_v4i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -427,10 +429,10 @@ ; ; AVX-LABEL: test_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -684,7 +686,8 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd 
{{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -692,7 +695,8 @@ ; ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: por %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -700,7 +704,7 @@ ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax @@ -712,9 +716,11 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -722,9 +728,11 @@ ; ; SSE41-LABEL: test_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 ; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pextrb $0, %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -732,9 +740,9 @@ ; ; AVX-LABEL: test_v4i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax @@ -746,12 +754,13 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-LABEL: test_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -759,12 +768,13 @@ ; ; SSE41-LABEL: test_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: por %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -772,12 +782,12 @@ ; ; AVX-LABEL: test_v8i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; 
AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq Index: llvm/test/CodeGen/X86/vector-reduce-smax.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -685,109 +685,35 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE2-LABEL: test_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movd %xmm3, %eax +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxsd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, 
%xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX512BW-NEXT: vpsraq $32, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vpsraq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq ; -; AVX512VL-LABEL: test_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX512VL-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -1130,200 +1056,62 @@ ; define i16 @test_v2i16(<2 x i16> %a0) { -; SSE2-LABEL: test_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: psllq $48, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: psllq $48, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: psllq $48, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pshufd 
{{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; SSE41-NEXT: psllq $48, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: psrad $16, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm1 -; AVX512BW-NEXT: vpsraq $48, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $48, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; SSE-LABEL: test_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pmaxsw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq ; -; AVX512VL-LABEL: test_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq 
$48, %xmm0, %xmm1 -; AVX512VL-NEXT: vpsraq $48, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq %1 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> %a0) ret i16 %1 } define i16 @test_v4i16(<4 x i16> %a0) { -; SSE2-LABEL: test_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pslld $16, %xmm0 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pslld $16, %xmm1 -; SSE41-NEXT: psrad $16, %xmm1 -; SSE41-NEXT: pmaxsd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pmaxsd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pmaxsw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pmaxsw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -1611,133 +1399,41 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: 
# %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: psllq $56, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: psllq $56, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: psllq $56, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; SSE41-NEXT: psllq $56, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: psrad $24, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, 
%xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm1 -; AVX512BW-NEXT: vpsraq $56, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $56, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq ; -; AVX512VL-LABEL: test_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm1 -; AVX512VL-NEXT: vpsraq $56, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq %1 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1745,19 +1441,17 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pslld $24, %xmm1 -; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 @@ -1767,42 +1461,32 @@ ; ; SSE41-LABEL: test_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pslld $24, %xmm1 -; SSE41-NEXT: psrad $24, %xmm1 -; SSE41-NEXT: pmaxsd %xmm0, %xmm1 -; SSE41-NEXT: pshufd 
{{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pmaxsd %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 ; SSE41-NEXT: pextrb $0, %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4i8: ; AVX: # %bb.0: -; AVX-NEXT: vpslld $24, %xmm0, %xmm1 -; AVX-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $24, %xmm0, %xmm1 -; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1813,82 +1497,64 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-LABEL: test_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: psllw $8, %xmm1 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: psllw $8, %xmm1 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: psraw $8, %xmm0 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: psraw $8, %xmm1 -; SSE41-NEXT: pmaxsw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: psraw $8, %xmm0 -; SSE41-NEXT: pmaxsw %xmm1, %xmm0 +; SSE41-NEXT: pshufd 
{{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: psraw $8, %xmm1 -; SSE41-NEXT: pmaxsw %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8i8: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vector-reduce-smin.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -684,109 +684,35 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE2-LABEL: test_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand 
%xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movd %xmm3, %eax +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminsd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX512BW-NEXT: vpsraq $32, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vpsraq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpminsq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq ; -; AVX512VL-LABEL: test_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX512VL-NEXT: vpsraq $32, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0 -; 
AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -1129,200 +1055,62 @@ ; define i16 @test_v2i16(<2 x i16> %a0) { -; SSE2-LABEL: test_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: psllq $48, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: psllq $48, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: psllq $48, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; SSE41-NEXT: psllq $48, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: psrad $16, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm1 -; AVX512BW-NEXT: vpsraq $48, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $48, %zmm0, %zmm0 -; AVX512BW-NEXT: vpminsq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; SSE-LABEL: test_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pminsw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq ; -; AVX512VL-LABEL: test_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm1 -; AVX512VL-NEXT: vpsraq $48, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq %1 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> %a0) ret i16 %1 } define i16 @test_v4i16(<4 x i16> %a0) { -; SSE2-LABEL: test_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: 
test_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pslld $16, %xmm0 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pslld $16, %xmm1 -; SSE41-NEXT: psrad $16, %xmm1 -; SSE41-NEXT: pminsd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pminsd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pminsw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pminsw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX-NEXT: vpminsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512-NEXT: vpminsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -1610,133 +1398,41 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: psllq $56, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: psllq $56, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, 
%xmm2 +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: psllq $56, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; SSE41-NEXT: psllq $56, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: psrad $24, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminsb %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm1 -; AVX512BW-NEXT: vpsraq $56, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraq $56, %zmm0, %zmm0 -; AVX512BW-NEXT: vpminsq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, 
%xmm0, %xmm1 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq ; -; AVX512VL-LABEL: test_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm1 -; AVX512VL-NEXT: vpsraq $56, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq %1 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1744,19 +1440,17 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pslld $24, %xmm1 -; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 @@ -1766,42 +1460,32 @@ ; ; SSE41-LABEL: test_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pslld $24, %xmm1 -; SSE41-NEXT: psrad $24, %xmm1 -; SSE41-NEXT: pminsd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pminsd %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pminsb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminsb %xmm1, %xmm0 ; SSE41-NEXT: pextrb $0, %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4i8: ; AVX: # %bb.0: -; AVX-NEXT: vpslld $24, %xmm0, %xmm1 -; AVX-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX-NEXT: vpminsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $24, %xmm0, %xmm1 -; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512-NEXT: vpminsd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, 
%xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1812,82 +1496,64 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-LABEL: test_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: psllw $8, %xmm1 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: psllw $8, %xmm1 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: psraw $8, %xmm0 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: psraw $8, %xmm1 -; SSE41-NEXT: pminsw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: psraw $8, %xmm0 -; SSE41-NEXT: pminsw %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminsb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pminsb %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: psraw $8, %xmm1 -; SSE41-NEXT: pminsw %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminsb %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8i8: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX-NEXT: vpminsw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpsraw $8, 
%xmm1, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpminsw %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vector-reduce-umax.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -742,87 +742,38 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE2-LABEL: test_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movd %xmm2, %eax +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq ; -; AVX512VL-LABEL: test_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -1224,85 +1175,41 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE2-LABEL: test_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movd %xmm2, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pmaxuw %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2i16: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq %1 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1310,70 +1217,48 @@ define i16 @test_v4i16(<4 x i16> %a0) { ; SSE2-LABEL: test_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 
+; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7] -; SSE41-NEXT: pmaxud %xmm0, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxuw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmaxuw %xmm1, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] -; AVX-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] -; AVX512-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -1777,83 +1662,37 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; 
SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: por %xmm3, %xmm4 -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: pextrb $0, %xmm2, %eax +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmaxub %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq %1 = 
call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1861,83 +1700,44 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmaxud %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pmaxub %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pmaxub %xmm1, %xmm0 ; SSE41-NEXT: pextrb $0, %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; 
AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1948,92 +1748,52 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-LABEL: test_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pmaxsw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmaxuw %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmaxuw %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxub %xmm0, %xmm1 +; SSE41-NEXT: movdqa 
%xmm1, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmaxuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pmaxub %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmaxub %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 
= xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vector-reduce-umin.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -741,87 +741,38 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE2-LABEL: test_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movd %xmm2, %eax +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 -; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v2i32: -; AVX512BW: # %bb.0: -; 
AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpminuq %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq ; -; AVX512VL-LABEL: test_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -1223,85 +1174,41 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE2-LABEL: test_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movd %xmm2, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pminuw %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: # kill: def $ax killed $ax 
killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2i16: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpminuq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq %1 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1309,70 +1216,48 @@ define i16 @test_v4i16(<4 x i16> %a0) { ; SSE2-LABEL: test_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = 
xmm0[2,3,0,1] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7] -; SSE41-NEXT: pminud %xmm0, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pminud %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminuw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pminuw %xmm1, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] -; AVX-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] -; AVX512-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -1683,83 +1568,37 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd 
%xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: pextrb $0, %xmm2, %eax +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpminuq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq %1 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1767,83 +1606,44 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pand 
{{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pminud %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pminud %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminub %xmm1, %xmm0 ; SSE41-NEXT: pextrb $0, %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; 
AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1854,92 +1654,52 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-LABEL: test_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pminsw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pminsw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pminuw %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pminuw %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pminuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pminub %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: 
pminub %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpminuw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq Index: 
llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -118,17 +118,29 @@ } define i1 @trunc_v8i16_v8i1(<8 x i8>) { -; SSE-LABEL: trunc_v8i16_v8i1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $15, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: xorb $0, %al -; SSE-NEXT: setnp %al -; SSE-NEXT: retq +; SSE2-LABEL: trunc_v8i16_v8i1: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: xorb $0, %al +; SSE2-NEXT: setnp %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: trunc_v8i16_v8i1: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: xorb $0, %al +; SSE41-NEXT: setnp %al +; SSE41-NEXT: retq ; ; AVX-LABEL: trunc_v8i16_v8i1: ; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmovmskb %xmm0, %eax @@ -138,9 +150,9 @@ ; ; AVX512F-LABEL: trunc_v8i16_v8i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: xorb $0, %al ; AVX512F-NEXT: setnp %al @@ -149,8 +161,8 @@ ; ; AVX512BW-LABEL: trunc_v8i16_v8i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: xorb $0, %al ; AVX512BW-NEXT: setnp %al @@ -159,8 +171,8 @@ ; ; AVX512VL-LABEL: trunc_v8i16_v8i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovw2m %xmm0, %k0 +; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovb2m %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: xorb $0, %al ; AVX512VL-NEXT: setnp %al @@ -1158,22 +1170,33 @@ } define i1 @icmp_v8i16_v8i1(<8 x i8>) { -; SSE-LABEL: icmp_v8i16_v8i1: -; SSE: # %bb.0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE-NEXT: packsswb %xmm0, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax -; SSE-NEXT: xorb $0, %al -; SSE-NEXT: setnp %al -; SSE-NEXT: retq +; SSE2-LABEL: icmp_v8i16_v8i1: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: xorb $0, %al +; SSE2-NEXT: setnp %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: icmp_v8i16_v8i1: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE41-NEXT: pmovsxbw %xmm1, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: 
pmovmskb %xmm0, %eax +; SSE41-NEXT: xorb $0, %al +; SSE41-NEXT: setnp %al +; SSE41-NEXT: retq ; ; AVX-LABEL: icmp_v8i16_v8i1: ; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmovmskb %xmm0, %eax ; AVX-NEXT: xorb $0, %al @@ -1182,11 +1205,10 @@ ; ; AVX512F-LABEL: icmp_v8i16_v8i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: xorb $0, %al ; AVX512F-NEXT: setnp %al @@ -1196,8 +1218,7 @@ ; AVX512BW-LABEL: icmp_v8i16_v8i1: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: xorb $0, %al ; AVX512BW-NEXT: setnp %al @@ -1206,7 +1227,7 @@ ; ; AVX512VL-LABEL: icmp_v8i16_v8i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vptestnmw {{.*}}(%rip), %xmm0, %k0 +; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: xorb $0, %al ; AVX512VL-NEXT: setnp %al Index: llvm/test/CodeGen/X86/vector-reduce-xor.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -182,14 +182,14 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE-LABEL: test_v2i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq @@ -397,7 +397,8 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE-LABEL: test_v2i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -405,7 +406,7 @@ ; ; AVX-LABEL: test_v2i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax @@ -417,9 +418,10 @@ define i16 @test_v4i16(<4 x i16> %a0) { ; SSE-LABEL: test_v4i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -427,10 +429,10 @@ ; ; AVX-LABEL: test_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpxor 
%xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -684,7 +686,8 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -692,7 +695,8 @@ ; ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -700,7 +704,7 @@ ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax @@ -712,9 +716,11 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -722,9 +728,11 @@ ; ; SSE41-LABEL: test_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: pextrb $0, %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -732,9 +740,9 @@ ; ; AVX-LABEL: test_v4i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax @@ -746,12 +754,13 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-LABEL: test_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -759,12 +768,13 @@ ; ; SSE41-LABEL: test_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: 
psrlw $8, %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm1 ; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -772,12 +782,12 @@ ; ; AVX-LABEL: test_v8i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq Index: llvm/test/CodeGen/X86/vector-sext.ll =================================================================== --- llvm/test/CodeGen/X86/vector-sext.ll +++ llvm/test/CodeGen/X86/vector-sext.ll @@ -3065,23 +3065,15 @@ ; ; X32-SSE2-LABEL: sext_2i8_to_i32: ; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: pushl %eax -; X32-SSE2-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: psraw $8, %xmm0 ; X32-SSE2-NEXT: movd %xmm0, %eax -; X32-SSE2-NEXT: popl %ecx -; X32-SSE2-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE2-NEXT: retl ; ; X32-SSE41-LABEL: sext_2i8_to_i32: ; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pushl %eax -; X32-SSE41-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0 ; X32-SSE41-NEXT: movd %xmm0, %eax -; X32-SSE41-NEXT: popl %ecx -; X32-SSE41-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE41-NEXT: retl entry: %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> @@ -3180,84 +3172,71 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; SSE2-LABEL: sext_4i8_to_4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_4i8_to_4i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pslld $24, %xmm0 -; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: psrad $24, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_4i8_to_4i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 +; SSE41-NEXT: pmovsxbq %xmm0, %xmm2 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmovsxbq %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: sext_4i8_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: sext_4i8_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: sext_4i8_to_4i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0 ; AVX512-NEXT: retq ; ; X32-SSE2-LABEL: sext_4i8_to_4i64: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pslld $24, %xmm0 -; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm1 ; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: pxor %xmm3, %xmm3 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X32-SSE2-NEXT: retl ; ; X32-SSE41-LABEL: sext_4i8_to_4i64: ; X32-SSE41: # %bb.0: -; X32-SSE41-NEXT: pslld $24, %xmm0 -; X32-SSE41-NEXT: psrad $24, %xmm0 -; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2 +; X32-SSE41-NEXT: psrld $16, %xmm0 +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl %extmask = sext <4 x i8> %mask to <4 x i64> @@ -3347,8 +3326,7 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_2i8_to_2i32: @@ -3358,20 +3336,23 @@ ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSSE3-NEXT: paddq %xmm0, %xmm0 +; SSSE3-NEXT: paddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_2i8_to_2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 -; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: 
sext_2i8_to_2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X32-SSE2-LABEL: sext_2i8_to_2i32: @@ -3382,15 +3363,16 @@ ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; X32-SSE2-NEXT: psrad $24, %xmm0 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-SSE2-NEXT: paddq %xmm0, %xmm0 +; X32-SSE2-NEXT: paddd %xmm0, %xmm0 ; X32-SSE2-NEXT: retl ; ; X32-SSE41-LABEL: sext_2i8_to_2i32: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 -; X32-SSE41-NEXT: paddq %xmm0, %xmm0 +; X32-SSE41-NEXT: movzwl (%eax), %eax +; X32-SSE41-NEXT: movd %eax, %xmm0 +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; X32-SSE41-NEXT: paddd %xmm0, %xmm0 ; X32-SSE41-NEXT: retl %x = load <2 x i8>, <2 x i8>* %addr, align 1 %y = sext <2 x i8> %x to <2 x i32> @@ -3816,8 +3798,8 @@ define <8 x i32> @zext_negate_sext(<8 x i8> %x) { ; SSE2-LABEL: zext_negate_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: psubw %xmm0, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: psrad $16, %xmm0 @@ -3827,8 +3809,8 @@ ; ; SSSE3-LABEL: zext_negate_sext: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSSE3-NEXT: psubw %xmm0, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSSE3-NEXT: psrad $16, %xmm0 @@ -3838,7 +3820,7 @@ ; ; SSE41-LABEL: zext_negate_sext: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: psubw %xmm0, %xmm1 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 @@ -3848,35 +3830,33 @@ ; ; AVX1-LABEL: zext_negate_sext: ; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_negate_sext: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), 
%xmm0, %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: zext_negate_sext: ; AVX512: # %bb.0: -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq ; ; X32-SSE2-LABEL: zext_negate_sext: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X32-SSE2-NEXT: psubw %xmm0, %xmm1 ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X32-SSE2-NEXT: psrad $16, %xmm0 @@ -3886,7 +3866,7 @@ ; ; X32-SSE41-LABEL: zext_negate_sext: ; X32-SSE41: # %bb.0: -; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X32-SSE41-NEXT: pxor %xmm1, %xmm1 ; X32-SSE41-NEXT: psubw %xmm0, %xmm1 ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0 @@ -3902,7 +3882,8 @@ define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) { ; SSE2-LABEL: zext_decremenet_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: paddw %xmm0, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3913,7 +3894,8 @@ ; ; SSSE3-LABEL: zext_decremenet_sext: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: paddw %xmm0, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3924,7 +3906,7 @@ ; ; SSE41-LABEL: zext_decremenet_sext: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 @@ -3934,35 +3916,33 @@ ; ; AVX1-LABEL: zext_decremenet_sext: ; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_decremenet_sext: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: zext_decremenet_sext: ; AVX512: # %bb.0: -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; X32-SSE2-LABEL: zext_decremenet_sext: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X32-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; X32-SSE2-NEXT: paddw %xmm0, %xmm1 ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3973,7 +3953,7 @@ ; ; X32-SSE41-LABEL: zext_decremenet_sext: ; X32-SSE41: # %bb.0: -; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X32-SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; X32-SSE41-NEXT: paddw %xmm0, %xmm1 ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0 Index: llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -20,156 +20,6 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-LABEL: var_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] 
-; SSE2-NEXT: psrlq %xmm4, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: psrlq %xmm4, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE2-NEXT: xorpd %xmm0, %xmm2 -; SSE2-NEXT: psubq %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlq %xmm1, %xmm3 -; SSE41-NEXT: psrlq %xmm0, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psrlq %xmm0, %xmm4 -; SSE41-NEXT: psrlq %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: pxor %xmm3, %xmm2 -; SSE41-NEXT: psubq %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: var_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: var_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; XOPAVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: var_shift_v2i32: -; AVX512: # %bb.0: 
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: var_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $32, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; X32-SSE-NEXT: psrad $31, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm3, %xmm0 -; X32-SSE-NEXT: psrlq %xmm1, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; X32-SSE-NEXT: xorps %xmm5, %xmm5 -; X32-SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] -; X32-SSE-NEXT: psrlq %xmm5, %xmm3 -; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; X32-SSE-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE-NEXT: psrlq %xmm5, %xmm0 -; X32-SSE-NEXT: psrlq %xmm1, %xmm2 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X32-SSE-NEXT: xorpd %xmm3, %xmm0 -; X32-SSE-NEXT: psubq %xmm3, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i32> %a, %b - ret <2 x i32> %shift -} - -define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrad %xmm2, %xmm3 @@ -188,100 +38,70 @@ ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; SSE41-LABEL: var_shift_v4i16: +; SSE41-LABEL: var_shift_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: pslld $16, %xmm0 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: psrad %xmm4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad %xmm1, %xmm2 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad %xmm1, %xmm3 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 
= xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: var_shift_v4i16: +; AVX1-LABEL: var_shift_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: var_shift_v4i16: +; AVX2-LABEL: var_shift_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX2-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: var_shift_v4i16: +; XOPAVX1-LABEL: var_shift_v2i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; -; XOPAVX2-LABEL: var_shift_v4i16: +; XOPAVX2-LABEL: var_shift_v2i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v4i16: +; AVX512-LABEL: var_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v4i16: +; AVX512VL-LABEL: 
var_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; -; X32-SSE-LABEL: var_shift_v4i16: +; X32-SSE-LABEL: var_shift_v2i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $16, %xmm0 -; X32-SSE-NEXT: psrad $16, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 ; X32-SSE-NEXT: psrad %xmm2, %xmm3 @@ -299,167 +119,21 @@ ; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; X32-SSE-NEXT: movaps %xmm2, %xmm0 ; X32-SSE-NEXT: retl - %shift = ashr <4 x i16> %a, %b - ret <4 x i16> %shift -} - -define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $48, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: psrlq %xmm4, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: psrlq %xmm4, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE2-NEXT: xorpd %xmm2, %xmm0 -; SSE2-NEXT: psubq %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: psllq $48, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrlq %xmm1, %xmm3 -; SSE41-NEXT: psrlq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psrlq %xmm2, %xmm4 -; SSE41-NEXT: psrlq %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: psubq %xmm3, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; 
AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOP-LABEL: var_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOP-NEXT: vpsllq $48, %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: var_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: var_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $48, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrad $31, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X32-SSE-NEXT: psrad $16, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: psrlq %xmm1, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psrlq %xmm4, %xmm2 -; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrlq %xmm1, %xmm3 -; X32-SSE-NEXT: psrlq %xmm4, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; X32-SSE-NEXT: xorpd %xmm2, %xmm0 -; X32-SSE-NEXT: psubq %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i16> %a, %b - ret <2 x i16> %shift + %shift = ashr <2 x i32> %a, %b + ret <2 x i32> %shift } -define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v8i8: +define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllw $8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psraw $15, 
%xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psraw $15, %xmm2 @@ -485,66 +159,57 @@ ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; SSE41-LABEL: var_shift_v8i8: +; SSE41-LABEL: var_shift_v4i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw $8, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psraw $8, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psllw $12, %xmm0 ; SSE41-NEXT: psllw $4, %xmm2 ; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: paddw %xmm2, %xmm4 -; SSE41-NEXT: psraw $15, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psraw $8, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm4, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm4, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: var_shift_v8i8: +; AVX1-LABEL: var_shift_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX1-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 -; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: var_shift_v8i8: +; AVX2-LABEL: var_shift_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX2-NEXT: vpsravd %ymm1, %ymm0, 
%ymm0 @@ -553,21 +218,15 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOP-LABEL: var_shift_v8i8: +; XOP-LABEL: var_shift_v4i16: ; XOP: # %bb.0: -; XOP-NEXT: vpsllw $8, %xmm0, %xmm0 -; XOP-NEXT: vpsraw $8, %xmm0, %xmm0 -; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512DQ-LABEL: var_shift_v8i8: +; AVX512DQ-LABEL: var_shift_v4i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 @@ -576,21 +235,17 @@ ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: var_shift_v8i8: +; AVX512BW-LABEL: var_shift_v4i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; -; AVX512DQVL-LABEL: var_shift_v8i8: +; AVX512DQVL-LABEL: var_shift_v4i16: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 @@ -598,27 +253,21 @@ ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; -; AVX512BWVL-LABEL: var_shift_v8i8: +; AVX512BWVL-LABEL: var_shift_v4i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq ; -; X32-SSE-LABEL: var_shift_v8i8: +; X32-SSE-LABEL: var_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psllw $8, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: psraw $8, %xmm3 ; X32-SSE-NEXT: psllw $12, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE-NEXT: psraw $15, %xmm0 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 ; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: pand %xmm0, %xmm2 -; X32-SSE-NEXT: pandn %xmm3, %xmm0 -; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psraw $8, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 ; X32-SSE-NEXT: paddw %xmm1, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2 ; X32-SSE-NEXT: psraw $15, %xmm2 @@ -643,138 +292,634 @@ ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: por %xmm2, %xmm0 ; X32-SSE-NEXT: retl - %shift = ashr <8 x i8> %a, %b - ret <8 x i8> %shift + %shift = ashr <4 x i16> %a, %b + ret <4 x i16> %shift } -define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v4i8: +define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v2i16: ; SSE2: # %bb.0: -; 
SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad %xmm3, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrad %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; SSE41-LABEL: var_shift_v4i8: +; SSE41-LABEL: var_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad %xmm1, %xmm3 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] -; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; 
AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psraw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: var_shift_v4i8: +; AVX2-LABEL: var_shift_v2i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: var_shift_v4i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: var_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: var_shift_v4i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; XOPAVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: var_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 
killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: var_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: var_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $12, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psraw $8, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psraw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: psraw $15, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: psraw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <2 x i16> %a, %b + ret <2 x i16> %shift +} + +define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $4, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 
+; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $2, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsraw 
$4, %xmm3, %xmm4 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: 
vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $4, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $2, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm4 +; X32-SSE-NEXT: pandn %xmm2, %xmm4 +; X32-SSE-NEXT: psraw $1, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm4, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psraw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <8 x i8> %a, %b + ret <8 x i8> %shift +} + +define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $4, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $2, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; 
SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: 
vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v4i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $24, 
%xmm0 -; X32-SSE-NEXT: psrad $24, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrad %xmm2, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrad %xmm4, %xmm2 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm4 -; X32-SSE-NEXT: psrad %xmm3, %xmm4 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: psrad %xmm1, %xmm0 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] -; X32-SSE-NEXT: movaps %xmm2, %xmm0 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $4, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $2, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm4 +; X32-SSE-NEXT: pandn %xmm2, %xmm4 +; X32-SSE-NEXT: psraw $1, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm4, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psraw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <4 x i8> %a, %b ret <4 x i8> %shift @@ -783,136 +928,226 @@ define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2-LABEL: var_shift_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $56, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $24, %xmm0 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: psrlq %xmm4, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: psrlq %xmm4, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE2-NEXT: xorpd %xmm2, %xmm0 -; SSE2-NEXT: psubq %xmm2, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $4, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $2, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: psllq $56, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlq %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE41-NEXT: psrlq %xmm4, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrlq %xmm4, %xmm3 -; SSE41-NEXT: 
psrlq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psubq %xmm2, %xmm0 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: var_shift_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpsrlq %xmm4, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: var_shift_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; XOP-LABEL: var_shift_v2i8: ; XOP: # %bb.0: -; XOP-NEXT: vpsllq $56, %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: var_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: var_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v2i8: +; AVX512DQVL: # 
%bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $56, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrad $31, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X32-SSE-NEXT: psrad $24, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: psrlq %xmm1, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psrlq %xmm4, %xmm2 -; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrlq %xmm1, %xmm3 -; X32-SSE-NEXT: psrlq %xmm4, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; X32-SSE-NEXT: xorpd %xmm2, %xmm0 -; X32-SSE-NEXT: psubq %xmm2, %xmm0 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $4, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $2, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm4 +; X32-SSE-NEXT: pandn %xmm2, %xmm4 +; X32-SSE-NEXT: psraw $1, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm4, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; 
X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psraw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i8> %a, %b ret <2 x i8> %shift @@ -925,153 +1160,46 @@ define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrlq %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: psrlq %xmm4, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlq %xmm0, %xmm3 -; SSE2-NEXT: psrlq %xmm4, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE2-NEXT: xorpd %xmm1, %xmm2 -; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: psrad %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlq %xmm0, %xmm3 -; SSE41-NEXT: psrlq %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psrlq %xmm1, %xmm4 -; SSE41-NEXT: psrlq %xmm0, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: pxor %xmm3, %xmm2 -; SSE41-NEXT: psubq %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrad %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: splatvar_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 
= xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: splatvar_shift_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $32, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; 
X32-SSE-NEXT: psrad $31, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,0,4294967295,0] -; X32-SSE-NEXT: pand %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm4, %xmm0 -; X32-SSE-NEXT: psrlq %xmm3, %xmm0 -; X32-SSE-NEXT: xorps %xmm5, %xmm5 -; X32-SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3] -; X32-SSE-NEXT: psrlq %xmm5, %xmm4 -; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; X32-SSE-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE-NEXT: psrlq %xmm5, %xmm0 -; X32-SSE-NEXT: psrlq %xmm3, %xmm2 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X32-SSE-NEXT: xorpd %xmm4, %xmm0 -; X32-SSE-NEXT: psubq %xmm4, %xmm0 +; X32-SSE-NEXT: xorps %xmm2, %xmm2 +; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; X32-SSE-NEXT: psrad %xmm2, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer %shift = ashr <2 x i32> %a, %splat @@ -1081,146 +1209,46 @@ define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad %xmm1, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad %xmm4, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad %xmm3, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrad %xmm2, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psraw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pslld $16, %xmm0 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: psrad %xmm4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad %xmm1, %xmm2 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7] -; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; 
SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psraw %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: splatvar_shift_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v4i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: splatvar_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v4i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, 
%xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $16, %xmm0 -; X32-SSE-NEXT: psrad $16, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrad %xmm1, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad %xmm4, %xmm1 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm4 -; X32-SSE-NEXT: psrad %xmm3, %xmm4 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: psrad %xmm2, %xmm0 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3] -; X32-SSE-NEXT: movaps %xmm1, %xmm0 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psraw %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i16> %a, %splat @@ -1230,366 +1258,188 @@ define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $48, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: psrlq %xmm4, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: psrlq %xmm4, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE2-NEXT: xorpd %xmm2, %xmm0 -; SSE2-NEXT: psubq %xmm2, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psraw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: psllq $48, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; 
SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrlq %xmm1, %xmm3 -; SSE41-NEXT: psrlq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psrlq %xmm2, %xmm4 -; SSE41-NEXT: psrlq %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: psubq %xmm3, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psraw %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: splatvar_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v2i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: splatvar_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v2i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq 
{{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0 -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v2i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $48, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrad $31, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X32-SSE-NEXT: psrad $16, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: psrlq %xmm1, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psrlq %xmm4, %xmm2 -; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrlq %xmm1, %xmm3 -; X32-SSE-NEXT: psrlq %xmm4, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; X32-SSE-NEXT: xorpd %xmm2, %xmm0 -; X32-SSE-NEXT: psubq %xmm2, %xmm0 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psraw %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer %shift = ashr <2 x i16> %a, %splat ret <2 x i16> %shift } -define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllw $8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE2-NEXT: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; 
SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $2, %xmm0 +define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw $8, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psraw $8, %xmm1 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: paddw %xmm2, %xmm4 -; SSE41-NEXT: psraw $15, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm4, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm4, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: psubb %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpsraw $15, 
%xmm0, %xmm0 -; AVX1-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 -; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; XOP-LABEL: splatvar_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpsllw $8, %xmm0, %xmm0 -; XOP-NEXT: vpsraw $8, %xmm0, %xmm0 -; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: splatvar_shift_v8i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_shift_v8i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512DQ-LABEL: splatvar_shift_v8i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512DQ-NEXT: 
vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQVL-LABEL: splatvar_shift_v8i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: splatvar_shift_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpmovzxbw 
{{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v8i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psllw $8, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: psraw $8, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; X32-SSE-NEXT: psllw $12, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE-NEXT: psraw $15, %xmm0 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: pand %xmm0, %xmm2 -; X32-SSE-NEXT: pandn %xmm3, %xmm0 -; X32-SSE-NEXT: por %xmm2, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psraw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: psraw $15, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: pandn %xmm0, %xmm2 -; X32-SSE-NEXT: psraw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: pxor %xmm2, %xmm0 +; X32-SSE-NEXT: psubb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer %shift = ashr <8 x i8> %a, %splat @@ -1599,133 +1449,139 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad %xmm1, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad %xmm4, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad %xmm3, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrad %xmm2, %xmm0 -; 
SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad %xmm1, %xmm3 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] -; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: psubb %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v4i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v4i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; 
AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: splatvar_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: splatvar_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v4i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $24, %xmm0 -; X32-SSE-NEXT: psrad $24, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrad %xmm1, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad %xmm4, %xmm1 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm4 -; X32-SSE-NEXT: psrad %xmm3, %xmm4 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: psrad %xmm2, %xmm0 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3] -; X32-SSE-NEXT: movaps %xmm1, %xmm0 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = 
xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: pxor %xmm2, %xmm0 +; X32-SSE-NEXT: psubb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i8> %a, %splat @@ -1735,138 +1591,130 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $56, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: psrlq %xmm4, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: psrlq %xmm4, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE2-NEXT: xorpd %xmm2, %xmm0 -; SSE2-NEXT: psubq %xmm2, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: psllq $56, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlq %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE41-NEXT: psrlq %xmm4, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrlq %xmm4, %xmm3 -; SSE41-NEXT: psrlq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; 
SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm2 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psubq %xmm2, %xmm0 +; SSE41-NEXT: psubb %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpsrlq %xmm4, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v2i8: ; XOP: # %bb.0: -; XOP-NEXT: vpsllq $56, %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: insertq {{.*#+}} xmm1 = 
xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: splatvar_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: splatvar_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $56, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrad $31, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X32-SSE-NEXT: psrad $24, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: psrlq %xmm1, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psrlq %xmm4, %xmm2 -; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrlq %xmm1, %xmm3 -; X32-SSE-NEXT: psrlq %xmm4, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; X32-SSE-NEXT: xorpd %xmm2, %xmm0 -; X32-SSE-NEXT: psubq %xmm2, %xmm0 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: pxor %xmm2, %xmm0 +; X32-SSE-NEXT: psubb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer %shift = ashr <2 x i8> %a, %splat @@ -1880,100 +1728,62 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $4, %xmm1 +; SSE2-NEXT: psrad $5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $4, %xmm0 -; SSE2-NEXT: psrlq $5, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd {{.*#+}} xmm0 = [576460752303423488,288230376151711744] -; SSE2-NEXT: xorpd %xmm0, %xmm1 -; SSE2-NEXT: psubq %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlq $5, %xmm0 -; SSE41-NEXT: psrlq $4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [576460752303423488,288230376151711744] -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: psubq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrad $5, %xmm1 +; SSE41-NEXT: psrad $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [576460752303423488,288230376151711744] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [576460752303423488,288230376151711744] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; XOP-LABEL: constant_shift_v2i32: -; XOP: # %bb.0: -; XOP-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: constant_shift_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: constant_shift_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5] -; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: constant_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v2i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $32, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; X32-SSE-NEXT: psrad $31, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psrad $4, %xmm1 +; X32-SSE-NEXT: psrad $5, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE-NEXT: psrlq $4, %xmm0 -; X32-SSE-NEXT: psrlq $5, %xmm1 -; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; X32-SSE-NEXT: movapd {{.*#+}} xmm0 = [3.7857669957336791E-270,2.0522684006491881E-289] -; X32-SSE-NEXT: xorpd %xmm0, %xmm1 -; X32-SSE-NEXT: psubq %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i32> %a, ret <2 x i32> 
%shift @@ -1982,96 +1792,83 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; SSE2-LABEL: constant_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $19, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad $18, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrad $17, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] +; SSE2-NEXT: psraw $2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] ; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: psraw $1, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pslld $16, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $16, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $19, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad $17, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: psrad $18, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: pmulhw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE41-NEXT: psraw $1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: constant_shift_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $19, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $17, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpsrad $18, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: retq +; AVX-LABEL: constant_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: retq ; -; AVX2-LABEL: constant_shift_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq +; XOP-LABEL: constant_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX1-LABEL: constant_shift_v4i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v4i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; XOPAVX2-LABEL: constant_shift_v4i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; 
XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512BW-LABEL: constant_shift_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512-LABEL: constant_shift_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512DQVL-LABEL: constant_shift_v4i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BWVL-LABEL: constant_shift_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $16, %xmm0 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad $16, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrad $19, %xmm2 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrad $18, %xmm3 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; X32-SSE-NEXT: psrad $17, %xmm0 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] +; X32-SSE-NEXT: psraw $2, %xmm1 +; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] ; X32-SSE-NEXT: movaps %xmm1, %xmm0 +; X32-SSE-NEXT: andps %xmm2, %xmm0 +; X32-SSE-NEXT: psraw $1, %xmm1 +; X32-SSE-NEXT: andnps %xmm1, %xmm2 +; X32-SSE-NEXT: orps %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <4 x i16> %a, ret <4 x i16> %shift @@ -2080,425 +1877,336 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $48, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $2, %xmm1 -; SSE2-NEXT: psrlq $3, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: movapd {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976] -; SSE2-NEXT: xorpd %xmm1, %xmm0 -; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: psraw $3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: psllq $48, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlq $3, %xmm1 -; SSE41-NEXT: psrlq $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976] -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: psubq %xmm1, %xmm0 +; SSE41-NEXT: psraw $3, %xmm1 +; SSE41-NEXT: psraw $2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: constant_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: constant_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsraw $3, %xmm0, %xmm1 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: retq ; ; XOP-LABEL: constant_shift_v2i16: ; XOP: # %bb.0: -; XOP-NEXT: vpsllq $48, %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: constant_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3] -; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsraw $3, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpsraw $2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: constant_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsraw $3, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpsraw $2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQVL-NEXT: retq +; +; 
AVX512BWVL-LABEL: constant_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v2i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $48, %xmm0 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad $31, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X32-SSE-NEXT: psrad $16, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlq $2, %xmm1 -; X32-SSE-NEXT: psrlq $3, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X32-SSE-NEXT: movapd {{.*#+}} xmm1 = [1.4916681462400413E-154,1.2882297539194267E-231] -; X32-SSE-NEXT: xorpd %xmm1, %xmm0 -; X32-SSE-NEXT: psubq %xmm1, %xmm0 +; X32-SSE-NEXT: psraw $3, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pandn %xmm1, %xmm2 +; X32-SSE-NEXT: por %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i16> %a, ret <2 x i16> %shift } define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { -; SSE2-LABEL: constant_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: psraw $12, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: psraw $1, %xmm1 -; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE-LABEL: constant_shift_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq ; -; SSE41-LABEL: constant_shift_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psraw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = -; SSE41-NEXT: pmulhw %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psraw $9, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: retq +; AVX1-LABEL: constant_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq ; -; AVX-LABEL: constant_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX-NEXT: 
vpsraw $8, %xmm0, %xmm1 -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpsraw $9, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: retq +; AVX2-LABEL: constant_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v8i8: ; XOP: # %bb.0: -; XOP-NEXT: vpsllw $8, %xmm0, %xmm0 -; XOP-NEXT: vpsraw $8, %xmm0, %xmm0 -; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512DQ-LABEL: constant_shift_v8i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQVL-LABEL: constant_shift_v8i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: constant_shift_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $8, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psraw $8, %xmm1 -; X32-SSE-NEXT: psraw $12, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X32-SSE-NEXT: movapd %xmm0, %xmm1 -; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; X32-SSE-NEXT: psraw $2, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; X32-SSE-NEXT: movaps %xmm1, %xmm0 -; X32-SSE-NEXT: andps %xmm2, %xmm0 -; X32-SSE-NEXT: psraw $1, %xmm1 -; X32-SSE-NEXT: andnps %xmm1, %xmm2 -; X32-SSE-NEXT: orps 
%xmm2, %xmm0 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: psraw $8, %xmm0 +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <8 x i8> %a, ret <8 x i8> %shift } define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { -; SSE2-LABEL: constant_shift_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $27, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad $26, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrad $25, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $24, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $27, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad $25, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: psrad $26, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; SSE41-NEXT: retq +; SSE-LABEL: constant_shift_v4i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: constant_shift_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $27, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $25, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpsrad $26, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; 
AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: constant_shift_v4i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: constant_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: constant_shift_v4i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: constant_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: constant_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v4i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $24, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad $24, %xmm1 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrad $27, %xmm2 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrad $26, %xmm3 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; X32-SSE-NEXT: psrad $25, %xmm0 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] -; X32-SSE-NEXT: movaps %xmm1, %xmm0 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: psraw $8, %xmm0 +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <4 x i8> %a, ret <4 x i8> %shift } define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { -; SSE2-LABEL: constant_shift_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $56, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[1,3,2,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $2, %xmm1 -; SSE2-NEXT: psrlq $3, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: movapd {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976] -; SSE2-NEXT: xorpd %xmm1, %xmm0 -; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: psllq $56, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlq $3, %xmm1 -; SSE41-NEXT: psrlq $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976] -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: psubq %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: constant_shift_v2i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v2i8: ; XOP: # 
%bb.0: -; XOP-NEXT: vpsllq $56, %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: constant_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3] -; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: constant_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $56, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad $31, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X32-SSE-NEXT: psrad $24, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlq $2, %xmm1 -; X32-SSE-NEXT: psrlq $3, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X32-SSE-NEXT: movapd {{.*#+}} xmm1 = [1.4916681462400413E-154,1.2882297539194267E-231] -; X32-SSE-NEXT: xorpd %xmm1, %xmm0 -; X32-SSE-NEXT: psubq %xmm1, %xmm0 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: psraw $8, %xmm0 +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i8> %a, ret <2 x i8> %shift @@ -2509,88 +2217,34 @@ ; define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { -; SSE2-LABEL: splatconstant_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: psrlq $5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatconstant_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrad $5, %xmm0 -; SSE41-NEXT: psrlq $5, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatconstant_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: retq +; SSE-LABEL: splatconstant_shift_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: psrad $5, %xmm0 +; SSE-NEXT: retq ; -; AVX2-LABEL: splatconstant_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpsrad $5, %xmm0, %xmm1 -; AVX2-NEXT: vpsrlq $5, %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: retq +; AVX-LABEL: splatconstant_shift_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpsrad $5, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v2i32: ; XOP: # %bb.0: -; XOP-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpsrad $5, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $37, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $37, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v2i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $32, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; X32-SSE-NEXT: psrad $31, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: psrad $5, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; X32-SSE-NEXT: psrlq $5, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE-NEXT: retl %shift = ashr <2 x i32> %a, ret <2 x i32> %shift @@ -2599,135 +2253,66 @@ define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v4i16: ; SSE: # %bb.0: -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $19, %xmm0 +; SSE-NEXT: psraw $3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpslld $16, %xmm0, 
%xmm0 -; AVX-NEXT: vpsrad $19, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v4i16: ; XOP: # %bb.0: -; XOP-NEXT: vpslld $16, %xmm0, %xmm0 -; XOP-NEXT: vpsrad $19, %xmm0, %xmm0 +; XOP-NEXT: vpsraw $3, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $19, %xmm0, %xmm0 +; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrad $19, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $16, %xmm0 -; X32-SSE-NEXT: psrad $19, %xmm0 +; X32-SSE-NEXT: psraw $3, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <4 x i16> %a, ret <4 x i16> %shift } define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind { -; SSE2-LABEL: splatconstant_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $48, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: psrlq $3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatconstant_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: psllq $48, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $3, %xmm1 -; SSE41-NEXT: psrlq $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatconstant_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpsrad $3, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: retq +; SSE-LABEL: splatconstant_shift_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: psraw $3, %xmm0 +; SSE-NEXT: retq ; -; AVX2-LABEL: splatconstant_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpsrad $3, %xmm0, %xmm1 -; AVX2-NEXT: vpsrlq $3, %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: retq +; AVX-LABEL: splatconstant_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v2i16: ; XOP: # %bb.0: -; XOP-NEXT: vpsllq $48, %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 +; 
XOP-NEXT: vpsraw $3, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $51, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $51, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v2i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $48, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad $31, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X32-SSE-NEXT: psrad $16, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad $3, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X32-SSE-NEXT: psrlq $3, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: psraw $3, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i16> %a, ret <2 x i16> %shift @@ -2736,38 +2321,52 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v8i8: ; SSE: # %bb.0: -; SSE-NEXT: psllw $8, %xmm0 -; SSE-NEXT: psraw $11, %xmm0 +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v8i8: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $11, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v8i8: ; XOP: # %bb.0: -; XOP-NEXT: vpsllw $8, %xmm0, %xmm0 -; XOP-NEXT: vpsraw $11, %xmm0, %xmm0 +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $11, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraw $11, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v8i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $8, %xmm0 -; X32-SSE-NEXT: psraw $11, %xmm0 +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; 
X32-SSE-NEXT: pxor %xmm1, %xmm0 +; X32-SSE-NEXT: psubb %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <8 x i8> %a, ret <8 x i8> %shift @@ -2776,135 +2375,106 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v4i8: ; SSE: # %bb.0: -; SSE-NEXT: pslld $24, %xmm0 -; SSE-NEXT: psrad $27, %xmm0 +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v4i8: ; AVX: # %bb.0: -; AVX-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX-NEXT: vpsrad $27, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v4i8: ; XOP: # %bb.0: -; XOP-NEXT: vpslld $24, %xmm0, %xmm0 -; XOP-NEXT: vpsrad $27, %xmm0, %xmm0 +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $27, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v4i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrad $27, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v4i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $24, %xmm0 -; X32-SSE-NEXT: psrad $27, %xmm0 +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: pxor %xmm1, %xmm0 +; X32-SSE-NEXT: psubb %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <4 x i8> %a, ret <4 x i8> %shift } define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { -; SSE2-LABEL: splatconstant_shift_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $56, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: psrlq $3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatconstant_shift_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: psllq $56, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: movdqa 
%xmm0, %xmm1 -; SSE41-NEXT: psrad $3, %xmm1 -; SSE41-NEXT: psrlq $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatconstant_shift_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpsrad $3, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: retq +; SSE-LABEL: splatconstant_shift_v2i8: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: retq ; -; AVX2-LABEL: splatconstant_shift_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpsrad $3, %xmm0, %xmm1 -; AVX2-NEXT: vpsrlq $3, %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: retq +; AVX-LABEL: splatconstant_shift_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v2i8: ; XOP: # %bb.0: -; XOP-NEXT: vpsllq $56, %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512-NEXT: vpsraq $59, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsraq $59, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $56, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad $31, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X32-SSE-NEXT: psrad $24, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad $3, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X32-SSE-NEXT: psrlq $3, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,2,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: pxor %xmm1, %xmm0 +; X32-SSE-NEXT: psubb %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i8> %a, ret <2 x i8> %shift Index: llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll +++ llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll @@ -29,91 +29,87 @@ ; SSE-NEXT: movd %r9d, %xmm0 ; SSE-NEXT: movd %r8d, %xmm1 ; SSE-NEXT: xorl %ecx, %ecx -; SSE-NEXT: pmovzxdq {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero -; SSE-NEXT: pmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero +; SSE-NEXT: pmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: pmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero ; SSE-NEXT: .p2align 4, 0x90 ; SSE-NEXT: .LBB0_4: # %vector.body ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE-NEXT: pmovzxbw {{.*#+}} xmm15 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE-NEXT: pslld $24, %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: pslld $24, %xmm0 -; SSE-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; SSE-NEXT: pslld $24, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: pslld $24, %xmm3 -; SSE-NEXT: pcmpeqw %xmm1, %xmm2 -; SSE-NEXT: pmovzxwd {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE-NEXT: pslld $24, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: pslld $24, %xmm2 -; SSE-NEXT: pcmpeqw %xmm1, %xmm15 -; SSE-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero -; SSE-NEXT: pslld $24, %xmm8 -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE-NEXT: pslld $24, %xmm15 -; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pslld %xmm14, %xmm4 -; SSE-NEXT: pslld %xmm13, %xmm6 -; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm6 +; SSE-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE-NEXT: pmovsxbd %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE-NEXT: pcmpeqb %xmm1, %xmm3 +; SSE-NEXT: pmovsxbd %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE-NEXT: pmovsxbd %xmm3, %xmm6 +; SSE-NEXT: pcmpeqb %xmm1, %xmm4 +; SSE-NEXT: pmovsxbd %xmm4, %xmm11 +; 
SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3] +; SSE-NEXT: pmovsxbd %xmm3, %xmm2 +; SSE-NEXT: pcmpeqb %xmm1, %xmm5 +; SSE-NEXT: pmovsxbd %xmm5, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,2,3] +; SSE-NEXT: pmovsxbd %xmm3, %xmm9 +; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pslld %xmm15, %xmm4 +; SSE-NEXT: pslld %xmm14, %xmm3 +; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm3 ; SSE-NEXT: movdqu (%rdi,%rcx,4), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pslld %xmm14, %xmm4 -; SSE-NEXT: pslld %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pslld %xmm15, %xmm5 +; SSE-NEXT: pslld %xmm14, %xmm10 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm10 ; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm12 ; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pslld %xmm14, %xmm5 -; SSE-NEXT: pslld %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pslld %xmm15, %xmm5 +; SSE-NEXT: pslld %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm12 -; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pslld %xmm14, %xmm5 -; SSE-NEXT: pslld %xmm13, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm3 +; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pslld %xmm15, %xmm5 +; SSE-NEXT: pslld %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm6 ; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pslld %xmm14, %xmm5 -; SSE-NEXT: pslld %xmm13, %xmm1 +; SSE-NEXT: pslld %xmm15, %xmm5 +; SSE-NEXT: pslld %xmm14, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm1 -; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pslld %xmm15, %xmm2 ; SSE-NEXT: pslld %xmm14, %xmm5 -; SSE-NEXT: pslld %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm5 +; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pslld %xmm15, %xmm4 +; SSE-NEXT: pslld %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm2 -; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pslld %xmm14, %xmm7 -; SSE-NEXT: pslld %xmm13, %xmm5 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm7, %xmm5 -; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm2 +; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pslld %xmm15, %xmm7 ; SSE-NEXT: pslld %xmm14, %xmm4 -; SSE-NEXT: pslld %xmm13, %xmm7 ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm7 +; SSE-NEXT: blendvps %xmm0, %xmm7, %xmm4 ; SSE-NEXT: movups %xmm10, (%rdi,%rcx,4) -; SSE-NEXT: movups %xmm6, 16(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm3, 32(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm3, 16(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm6, 32(%rdi,%rcx,4) ; SSE-NEXT: movups %xmm12, 48(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm2, 64(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm5, 64(%rdi,%rcx,4) ; SSE-NEXT: movups %xmm1, 80(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm7, 96(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm5, 112(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm4, 96(%rdi,%rcx,4) +; SSE-NEXT: movups 
%xmm2, 112(%rdi,%rcx,4) ; SSE-NEXT: addq $32, %rcx ; SSE-NEXT: cmpq %rcx, %rdx ; SSE-NEXT: jne .LBB0_4 @@ -179,33 +175,33 @@ ; AVX1-NEXT: # xmm1 = mem[0],zero,mem[1],zero ; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: # xmm2 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpcmpeqw %xmm11, %xmm3, %xmm3 -; AVX1-NEXT: vpmovsxwd %xmm3, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm11, %xmm4, %xmm4 -; AVX1-NEXT: vpmovsxwd %xmm4, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqw %xmm11, %xmm5, %xmm5 +; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero +; AVX1-NEXT: vpcmpeqb %xmm11, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm11, %xmm4, %xmm4 +; AVX1-NEXT: vpmovsxbd %xmm4, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm11, %xmm5, %xmm5 ; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm9 ; AVX1-NEXT: vpslld %xmm2, %xmm9, %xmm10 ; AVX1-NEXT: vpslld %xmm1, %xmm9, %xmm0 ; AVX1-NEXT: vblendvps %xmm7, %xmm10, %xmm0, %xmm9 -; AVX1-NEXT: vpmovsxwd %xmm5, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqw %xmm11, %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqb %xmm11, %xmm6, %xmm6 ; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm0 ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxwd %xmm6, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 ; AVX1-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm10 ; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm2 ; AVX1-NEXT: vpslld %xmm15, %xmm2, %xmm3 Index: llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -20,103 +20,6 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-LABEL: var_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrlq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psrlq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE2-NEXT: retq -; 
-; SSE41-LABEL: var_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrlq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: psrlq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: var_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: var_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: var_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: var_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrlq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: xorps %xmm3, %xmm3 -; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; X32-SSE-NEXT: psrlq %xmm3, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X32-SSE-NEXT: retl - %shift = lshr <2 x i32> %a, %b - ret <2 x i32> %shift -} - -define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrld %xmm2, %xmm3 
@@ -135,93 +38,70 @@ ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; SSE41-LABEL: var_shift_v4i16: +; SSE41-LABEL: var_shift_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrld %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: psrld %xmm4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrld %xmm1, %xmm2 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: var_shift_v4i16: +; AVX1-LABEL: var_shift_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: var_shift_v4i16: +; AVX2-LABEL: var_shift_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: var_shift_v4i16: +; XOPAVX1-LABEL: var_shift_v2i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; -; XOPAVX2-LABEL: var_shift_v4i16: +; XOPAVX2-LABEL: var_shift_v2i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] ; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v4i16: +; AVX512-LABEL: var_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] ; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v4i16: +; AVX512VL-LABEL: var_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] ; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; -; X32-SSE-LABEL: var_shift_v4i16: +; X32-SSE-LABEL: var_shift_v2i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 ; X32-SSE-NEXT: psrld %xmm2, %xmm3 @@ -239,111 +119,21 @@ ; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; X32-SSE-NEXT: movaps %xmm2, %xmm0 ; X32-SSE-NEXT: retl - %shift = lshr <4 x i16> %a, %b - ret <4 x i16> %shift -} - -define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrlq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psrlq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrlq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: psrlq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i16: -; AVX1: 
# %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: var_shift_v2i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: var_shift_v2i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: var_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: var_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrlq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psrlq %xmm1, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X32-SSE-NEXT: retl - %shift = lshr <2 x i16> %a, %b - ret <2 x i16> %shift + %shift = lshr <2 x i32> %a, %b + ret <2 x i32> %shift } -define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v8i8: +define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psraw $15, %xmm2 @@ -369,20 +159,18 @@ ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; SSE41-LABEL: var_shift_v8i8: +; SSE41-LABEL: var_shift_v4i16: ; SSE41: 
# %bb.0: ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psllw $12, %xmm0 ; SSE41-NEXT: psllw $4, %xmm2 ; SSE41-NEXT: por %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrlw $8, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 @@ -402,16 +190,13 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: var_shift_v8i8: +; AVX1-LABEL: var_shift_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 @@ -423,11 +208,8 @@ ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: var_shift_v8i8: +; AVX2-LABEL: var_shift_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 @@ -437,31 +219,15 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: var_shift_v8i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; XOPAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: var_shift_v8i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: var_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; -; AVX512DQ-LABEL: var_shift_v8i8: +; AVX512DQ-LABEL: var_shift_v4i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; 
AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 @@ -470,21 +236,17 @@ ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: var_shift_v8i8: +; AVX512BW-LABEL: var_shift_v4i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; -; AVX512DQVL-LABEL: var_shift_v8i8: +; AVX512DQVL-LABEL: var_shift_v4i16: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 @@ -492,22 +254,21 @@ ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; -; AVX512BWVL-LABEL: var_shift_v8i8: +; AVX512BWVL-LABEL: var_shift_v4i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq ; -; X32-SSE-LABEL: var_shift_v8i8: +; X32-SSE-LABEL: var_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 ; X32-SSE-NEXT: psllw $12, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE-NEXT: psraw $15, %xmm0 -; X32-SSE-NEXT: pandn %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 ; X32-SSE-NEXT: paddw %xmm1, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2 ; X32-SSE-NEXT: psraw $15, %xmm2 @@ -532,138 +293,479 @@ ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: por %xmm2, %xmm0 ; X32-SSE-NEXT: retl - %shift = lshr <8 x i8> %a, %b - ret <8 x i8> %shift + %shift = lshr <4 x i16> %a, %b + ret <4 x i16> %shift } -define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v4i8: +define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrld %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = 
xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; SSE41-LABEL: var_shift_v4i8: +; SSE41-LABEL: var_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm1, %xmm3 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] -; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrlw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; 
SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $2, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $1, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: var_shift_v4i8: +; AVX2-LABEL: var_shift_v2i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: var_shift_v4i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: var_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: var_shift_v4i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: var_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: var_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = 
[255,255,255,255] -; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: var_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $12, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: psraw $15, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: psrlw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <2 x i16> %a, %b + ret <2 x i16> %shift +} + +define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: 
psrlw $1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v8i8: 
+; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psrlw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <8 x i8> %a, %b + ret <8 x i8> %shift +} + +define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand 
{{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, 
%zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v4i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psrlw $1, %xmm0 ; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrld %xmm2, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrld %xmm4, %xmm2 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm4 -; X32-SSE-NEXT: psrld %xmm3, %xmm4 -; X32-SSE-NEXT: pshuflw 
{{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: psrld %xmm1, %xmm0 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] -; X32-SSE-NEXT: movaps %xmm2, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <4 x i8> %a, %b ret <4 x i8> %shift @@ -672,93 +774,148 @@ define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2-LABEL: var_shift_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrlq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psrlq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrlq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: psrlq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: var_shift_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq +; AVX-LABEL: var_shift_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; 
AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; AVX2-LABEL: var_shift_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; XOP-LABEL: var_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX1-LABEL: var_shift_v2i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321] -; XOPAVX1-NEXT: # xmm2 = mem[0,0] -; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX512DQ-LABEL: var_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; XOPAVX2-LABEL: var_shift_v2i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512BW-LABEL: var_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512-LABEL: var_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; 
AVX512DQVL-LABEL: var_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BWVL-LABEL: var_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psrlw $1, %xmm0 ; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrlq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psrlq %xmm1, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <2 x i8> %a, %b ret <2 x i8> %shift @@ -771,99 +928,46 @@ define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x 
i32> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrlq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psrlq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: psrld %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrlq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: psrlq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrld %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: splatvar_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: splatvar_shift_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; 
AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrlq %xmm2, %xmm3 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: xorps %xmm2, %xmm2 ; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X32-SSE-NEXT: psrlq %xmm2, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; X32-SSE-NEXT: psrld %xmm2, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer %shift = lshr <2 x i32> %a, %splat @@ -873,139 +977,46 @@ define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld %xmm1, %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld %xmm4, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm2, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: psrld %xmm4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: movdqa 
%xmm0, %xmm2 -; SSE41-NEXT: psrld %xmm1, %xmm2 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7] -; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: splatvar_shift_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v4i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: splatvar_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v4i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512-NEXT: 
vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrld %xmm1, %xmm2 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrld %xmm4, %xmm1 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm4 -; X32-SSE-NEXT: psrld %xmm3, %xmm4 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: psrld %xmm2, %xmm0 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3] -; X32-SSE-NEXT: movaps %xmm1, %xmm0 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer %shift = lshr <4 x i16> %a, %splat @@ -1015,302 +1026,168 @@ define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrlq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psrlq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrlq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: psrlq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; 
SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: splatvar_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v2i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: splatvar_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v2i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v2i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; -; X32-SSE-LABEL: splatvar_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: 
psrlq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psrlq %xmm1, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X32-SSE-NEXT: retl - %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer - %shift = lshr <2 x i16> %a, %splat - ret <2 x i16> %shift -} - -define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE2-NEXT: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm0 +; X32-SSE-LABEL: splatvar_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer + %shift = lshr <2 x i16> %a, %splat + ret <2 x i16> %shift +} + +define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $2, 
%xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v8i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; 
XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v8i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512DQ-LABEL: splatvar_shift_v8i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQVL-LABEL: splatvar_shift_v8i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: splatvar_shift_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v8i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; X32-SSE-NEXT: psllw $12, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE-NEXT: psraw $15, %xmm0 -; X32-SSE-NEXT: pandn %xmm2, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psrlw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: psraw $15, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: pandn %xmm0, %xmm2 -; X32-SSE-NEXT: psrlw $1, %xmm0 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: por %xmm2, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer %shift = lshr <8 x i8> %a, %splat @@ -1320,132 +1197,119 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld %xmm1, %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld %xmm4, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm2, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm1, %xmm3 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] -; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v4i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; 
XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v4i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: splatvar_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: splatvar_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; 
AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v4i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrld %xmm1, %xmm2 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrld %xmm4, %xmm1 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm4 -; X32-SSE-NEXT: psrld %xmm3, %xmm4 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: psrld %xmm2, %xmm0 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3] -; X32-SSE-NEXT: movaps %xmm1, %xmm0 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer %shift = lshr <4 x i8> %a, %splat @@ -1455,94 +1319,110 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrlq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psrlq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrlq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: psrlq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 
= xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: splatvar_shift_v2i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321] -; XOPAVX1-NEXT: # xmm2 = mem[0,0] -; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: splatvar_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v2i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatvar_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: 
vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: splatvar_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: splatvar_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrlq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psrlq %xmm1, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = 
shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer %shift = lshr <2 x i8> %a, %splat @@ -1556,75 +1436,62 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $4, %xmm1 -; SSE2-NEXT: psrlq $5, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: psrld $4, %xmm1 +; SSE2-NEXT: psrld $5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlq $5, %xmm0 -; SSE41-NEXT: psrlq $4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $5, %xmm1 +; SSE41-NEXT: psrld $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: constant_shift_v2i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_shift_v2i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: constant_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v2i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlq $4, %xmm1 -; X32-SSE-NEXT: psrlq $5, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-SSE-NEXT: psrld $4, %xmm1 +; X32-SSE-NEXT: psrld $5, %xmm0 +; X32-SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <2 x i32> %a, ret <2 x i32> %shift @@ -1633,91 +1500,66 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; SSE2-LABEL: constant_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $3, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $2, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: psrld $3, %xmm1 -; SSE41-NEXT: psrld $1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: constant_shift_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: constant_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: retq ; -; XOPAVX1-LABEL: constant_shift_v4i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: constant_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: constant_shift_v4i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v4i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), 
%xmm0, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: constant_shift_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_shift_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: constant_shift_v4i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrld $3, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrld $2, %xmm2 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrld $1, %xmm1 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: por %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <4 x i16> %a, ret <4 x i16> %shift @@ -1726,75 +1568,72 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $2, %xmm1 -; SSE2-NEXT: psrlq $3, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: psrlw $3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlq $3, %xmm0 -; SSE41-NEXT: psrlq $2, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $3, %xmm1 +; SSE41-NEXT: psrlw $2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: constant_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; 
AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: retq +; AVX-LABEL: constant_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: retq ; -; AVX2-LABEL: constant_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq +; XOP-LABEL: constant_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX1-LABEL: constant_shift_v2i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsrlw $3, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQ-NEXT: retq ; -; XOPAVX2-LABEL: constant_shift_v2i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512BW-LABEL: constant_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512-LABEL: constant_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512DQVL-LABEL: constant_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQVL-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BWVL-LABEL: constant_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v2i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlq $2, %xmm1 -; X32-SSE-NEXT: psrlq $3, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-SSE-NEXT: psrlw $3, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pandn %xmm1, %xmm2 +; X32-SSE-NEXT: por %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <2 x i16> %a, ret <2 x i16> %shift @@ -1803,74 +1642,94 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; 
SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = -; SSE41-NEXT: pmulhuw %xmm0, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: constant_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: retq +; AVX1-LABEL: constant_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v8i8: ; XOP: # %bb.0: -; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512DQ-LABEL: constant_shift_v8i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQVL-LABEL: constant_shift_v8i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: constant_shift_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: pandn %xmm0, %xmm2 -; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw 
$8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <8 x i8> %a, ret <8 x i8> %shift @@ -1879,85 +1738,94 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $3, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $2, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $2, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrld $3, %xmm0 -; SSE41-NEXT: psrld $1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; 
AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: constant_shift_v4i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: constant_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: constant_shift_v4i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: constant_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: constant_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v4i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrld $3, %xmm1 +; X32-SSE-NEXT: 
pxor %xmm1, %xmm1 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrld $2, %xmm2 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrld $1, %xmm1 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <4 x i8> %a, ret <4 x i8> %shift @@ -1966,67 +1834,94 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $2, %xmm1 -; SSE2-NEXT: psrlq $3, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlq $3, %xmm1 -; SSE41-NEXT: psrlq $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: constant_shift_v2i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: constant_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: constant_shift_v2i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: constant_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: constant_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: 
vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlq $2, %xmm1 -; X32-SSE-NEXT: psrlq $3, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <2 x i8> %a, ret <2 x i8> %shift @@ -2037,167 +1932,102 @@ ; define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { -; SSE2-LABEL: splatconstant_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psrlq $5, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatconstant_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: psrlq $5, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatconstant_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatconstant_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpsrlq $5, %xmm0, %xmm0 -; AVX2-NEXT: retq +; SSE-LABEL: splatconstant_shift_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: psrld $5, %xmm0 +; SSE-NEXT: retq ; -; XOPAVX1-LABEL: splatconstant_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; XOPAVX1-NEXT: vpsrlq $5, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: splatconstant_shift_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $5, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: splatconstant_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; XOPAVX2-NEXT: vpsrlq $5, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatconstant_shift_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpsrld $5, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512-NEXT: vpsrlq $5, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vpsrlq $5, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v2i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrlq $5, %xmm0 +; X32-SSE-NEXT: psrld $5, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <2 x i32> %a, ret <2 x i32> %shift } define <4 x 
i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind { -; SSE2-LABEL: splatconstant_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psrld $3, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatconstant_shift_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; SSE41-NEXT: psrld $3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: splatconstant_shift_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX-NEXT: vpsrld $3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v4i16: ; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; XOP-NEXT: vpsrld $3, %xmm0, %xmm0 +; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpsrld $3, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512VL-NEXT: vpsrld $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrld $3, %xmm0 +; X32-SSE-NEXT: psrlw $3, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <4 x i16> %a, ret <4 x i16> %shift } define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind { -; SSE2-LABEL: splatconstant_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psrlq $3, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatconstant_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE41-NEXT: psrlq $3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: splatconstant_shift_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v2i16: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpsrlq $3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v2i16: ; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; XOP-NEXT: vpsrlq $3, %xmm0, %xmm0 +; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512-NEXT: vpsrlq $3, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512VL-NEXT: vpsrlq $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v2i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrlq $3, %xmm0 +; X32-SSE-NEXT: psrlw $3, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <2 x i16> %a, ret <2 x i16> %shift @@ -2206,38 +2036,37 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v8i8: ; SSE: # %bb.0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v8i8: ; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v8i8: ; XOP: # %bb.0: -; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0 +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v8i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <8 x i8> %a, ret <8 x i8> %shift @@ -2246,38 +2075,37 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v4i8: ; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: psrld $3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v4i8: ; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpsrld $3, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v4i8: ; XOP: # %bb.0: -; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpsrld $3, %xmm0, %xmm0 +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i8: ; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v4i8: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrld $3, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v4i8: ; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrlw $3, %xmm0 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrld $3, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <4 x i8> %a, ret <4 x i8> %shift @@ -2286,38 +2114,37 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v2i8: ; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: psrlq $3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v2i8: ; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpsrlq $3, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; 
XOP-LABEL: splatconstant_shift_v2i8: ; XOP: # %bb.0: -; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpsrlq $3, %xmm0, %xmm0 +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i8: ; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlq $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i8: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlq $3, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v2i8: ; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrlw $3, %xmm0 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrlq $3, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <2 x i8> %a, ret <2 x i8> %shift Index: llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -20,88 +20,6 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-LABEL: var_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psllq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: var_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: var_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: var_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: var_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i32: -; 
X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psllq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: xorps %xmm3, %xmm3 -; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; X32-SSE-NEXT: psllq %xmm3, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X32-SSE-NEXT: retl - %shift = shl <2 x i32> %a, %b - ret <2 x i32> %shift -} - -define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v4i16: -; SSE2: # %bb.0: ; SSE2-NEXT: pslld $23, %xmm1 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 @@ -114,7 +32,7 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; -; SSE41-LABEL: var_shift_v4i16: +; SSE41-LABEL: var_shift_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pslld $23, %xmm1 ; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 @@ -122,7 +40,7 @@ ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: var_shift_v4i16: +; AVX1-LABEL: var_shift_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 @@ -130,42 +48,32 @@ ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: var_shift_v4i16: +; AVX2-LABEL: var_shift_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: var_shift_v4i16: +; XOPAVX1-LABEL: var_shift_v2i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; -; XOPAVX2-LABEL: var_shift_v4i16: +; XOPAVX2-LABEL: var_shift_v2i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v4i16: +; AVX512-LABEL: var_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v4i16: +; AVX512VL-LABEL: var_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; -; X32-SSE-LABEL: var_shift_v4i16: +; X32-SSE-LABEL: var_shift_v2i32: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: pslld $23, %xmm1 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 @@ -178,94 +86,13 @@ ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE-NEXT: retl - %shift = shl <4 x i16> %a, %b - ret <4 x i16> %shift -} - -define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psllq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE2-NEXT: 
retq -; -; SSE41-LABEL: var_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: var_shift_v2i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: var_shift_v2i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: var_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: var_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psllq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psllq %xmm1, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X32-SSE-NEXT: retl - %shift = shl <2 x i16> %a, %b - ret <2 x i16> %shift + %shift = shl <2 x i32> %a, %b + ret <2 x i32> %shift } -define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v8i8: +define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -287,17 +114,14 @@ ; SSE2-NEXT: pmullw %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; SSE41-LABEL: var_shift_v8i8: +; SSE41-LABEL: var_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE41-NEXT: pslld $23, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; SSE41-NEXT: paddd %xmm3, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd %xmm3, %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 @@ -305,15 +129,14 @@ ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: var_shift_v8i8: +; AVX1-LABEL: var_shift_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 @@ -321,11 +144,10 @@ ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: var_shift_v8i8: +; AVX2-LABEL: var_shift_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -333,51 +155,46 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOP-LABEL: var_shift_v8i8: +; XOP-LABEL: var_shift_v4i16: ; XOP: # %bb.0: -; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512DQ-LABEL: var_shift_v8i8: +; AVX512DQ-LABEL: var_shift_v4i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: var_shift_v8i8: +; AVX512BW-LABEL: var_shift_v4i16: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $xmm0 
killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; -; AVX512DQVL-LABEL: var_shift_v8i8: +; AVX512DQVL-LABEL: var_shift_v4i16: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; -; AVX512BWVL-LABEL: var_shift_v8i8: +; AVX512BWVL-LABEL: var_shift_v4i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq ; -; X32-SSE-LABEL: var_shift_v8i8: +; X32-SSE-LABEL: var_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: pxor %xmm2, %xmm2 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -398,160 +215,566 @@ ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; X32-SSE-NEXT: pmullw %xmm1, %xmm0 ; X32-SSE-NEXT: retl - %shift = shl <8 x i8> %a, %b - ret <8 x i8> %shift + %shift = shl <4 x i16> %a, %b + ret <4 x i16> %shift } -define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v4i8: +define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: pmullw %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; SSE41-LABEL: var_shift_v4i8: +; SSE41-LABEL: var_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm3, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE41-NEXT: packusdw %xmm1, %xmm2 +; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: var_shift_v4i8: +; AVX1-LABEL: var_shift_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: var_shift_v4i8: +; AVX2-LABEL: var_shift_v2i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: var_shift_v4i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: var_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: var_shift_v4i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: var_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: var_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: 
def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: var_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq ; -; X32-SSE-LABEL: var_shift_v4i8: +; AVX512BWVL-LABEL: var_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X32-SSE-NEXT: pslld $23, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X32-SSE-NEXT: paddd %xmm4, %xmm3 +; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X32-SSE-NEXT: pslld $23, %xmm1 -; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: paddd %xmm4, %xmm1 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X32-SSE-NEXT: pmullw %xmm1, %xmm0 ; X32-SSE-NEXT: retl - %shift = shl <4 x i8> %a, %b - ret <4 x i8> %shift + %shift = shl <2 x i16> %a, %b + ret <2 x i16> %shift +} + +define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn 
%xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: paddb %xmm0, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <8 x i8> %a, %b + ret <8 x i8> %shift +} + +define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, 
%xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: paddb %xmm0, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <4 x i8> %a, %b + ret <4 x i8> %shift } define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2-LABEL: var_shift_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psllq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, 
%xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: psllq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: var_shift_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq +; AVX-LABEL: var_shift_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; AVX2-LABEL: var_shift_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; XOP-LABEL: var_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX1-LABEL: var_shift_v2i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX512DQ-LABEL: var_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; XOPAVX2-LABEL: var_shift_v2i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512BW-LABEL: var_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512-LABEL: var_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512DQVL-LABEL: var_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BWVL-LABEL: var_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; 
AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psllq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psllq %xmm1, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: paddb %xmm0, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <2 x i8> %a, %b ret <2 x i8> %shift @@ -564,88 +787,46 @@ define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psllq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: pslld %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pslld %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: splatvar_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: splatvar_shift_v2i32: 
+; AVX: # %bb.0: +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; X32-SSE-NEXT: pand %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psllq %xmm2, %xmm3 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: xorps %xmm2, %xmm2 ; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X32-SSE-NEXT: psllq %xmm2, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; X32-SSE-NEXT: pslld %xmm2, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer %shift = shl <2 x i32> %a, %splat @@ -655,90 +836,46 @@ define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 -; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: splatvar_shift_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd 
{{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v4i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: splatvar_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v4i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pslld $23, %xmm1 -; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer %shift = shl <4 x i16> %a, %splat @@ -748,87 +885,46 @@ define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i16: ; SSE2: # %bb.0: -; 
SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psllq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: splatvar_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v2i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: splatvar_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v2i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v2i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; 
AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psllq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psllq %xmm1, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer %shift = shl <2 x i16> %a, %splat @@ -838,140 +934,114 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psllw %xmm1, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: pslld $23, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE41-NEXT: packusdw %xmm1, %xmm2 -; SSE41-NEXT: pmullw 
%xmm2, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psllw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; XOP-LABEL: splatvar_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: splatvar_shift_v8i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_shift_v8i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512DQ-LABEL: splatvar_shift_v8i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQVL-LABEL: splatvar_shift_v8i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: splatvar_shift_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v8i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psllw %xmm1, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: movdqa %xmm1, %xmm3 -; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X32-SSE-NEXT: pslld $23, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; X32-SSE-NEXT: paddd %xmm4, %xmm3 -; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X32-SSE-NEXT: pslld $23, %xmm1 -; X32-SSE-NEXT: paddd %xmm4, %xmm1 -; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; X32-SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; X32-SSE-NEXT: pmullw %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer %shift = shl <8 x i8> %a, %splat @@ -981,82 +1051,114 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v4i8: ; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psllw %xmm1, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 -; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psllw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: 
splatvar_shift_v4i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v4i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: splatvar_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: splatvar_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v4i8: ; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psllw %xmm1, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: pslld $23, %xmm1 -; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer %shift = shl <4 x i8> %a, %splat @@ -1066,73 +1168,107 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: psllq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psllw %xmm1, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: psllq %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psllw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: splatvar_shift_v2i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: splatvar_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: splatvar_shift_v2i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatvar_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: splatvar_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: splatvar_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psllq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: psllq %xmm1, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psllw %xmm1, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer %shift = shl <2 x i8> %a, %splat @@ -1147,120 +1283,109 @@ ; SSE2-LABEL: constant_shift_v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllq $4, %xmm1 -; SSE2-NEXT: psllq $5, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: pslld $4, %xmm1 +; SSE2-NEXT: pslld $5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq $5, %xmm1 -; SSE41-NEXT: psllq $4, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pslld $5, %xmm1 +; SSE41-NEXT: pslld $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $5, %xmm0, %xmm1 -; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpslld $5, %xmm0, %xmm1 +; AVX1-NEXT: vpslld $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: constant_shift_v2i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_shift_v2i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), 
%xmm0, %xmm0 +; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: constant_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v2i32: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psllq $4, %xmm1 -; X32-SSE-NEXT: psllq $5, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-SSE-NEXT: pslld $4, %xmm1 +; X32-SSE-NEXT: pslld $5, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <2 x i32> %a, ret <2 x i32> %shift } define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { -; SSE2-LABEL: constant_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: constant_shift_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq ; -; AVX1-LABEL: constant_shift_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: retq +; AVX-LABEL: constant_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq ; -; AVX2-LABEL: constant_shift_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq +; XOP-LABEL: constant_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX1-LABEL: constant_shift_v4i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v4i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: retq ; -; XOPAVX2-LABEL: constant_shift_v4i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512BW-LABEL: constant_shift_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512-LABEL: constant_shift_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512DQVL-LABEL: constant_shift_v4i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQVL-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BWVL-LABEL: constant_shift_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X32-SSE-NEXT: 
pmuludq %xmm1, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <4 x i16> %a, ret <4 x i16> %shift @@ -1269,106 +1394,149 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllq $2, %xmm1 -; SSE2-NEXT: psllq $3, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq $3, %xmm1 -; SSE41-NEXT: psllq $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: psllw $3, %xmm1 +; SSE41-NEXT: psllw $2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: constant_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $3, %xmm0, %xmm1 -; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: retq +; AVX-LABEL: constant_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: retq ; -; AVX2-LABEL: constant_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq +; XOP-LABEL: constant_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX1-LABEL: constant_shift_v2i16: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQ-NEXT: retq ; -; XOPAVX2-LABEL: constant_shift_v2i16: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512BW-LABEL: constant_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512-LABEL: constant_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512DQVL-LABEL: constant_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpsllw $2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQVL-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512BWVL-LABEL: constant_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v2i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psllq $2, %xmm1 -; X32-SSE-NEXT: psllq $3, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = 
xmm1[0],xmm0[1] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <2 x i16> %a, ret <2 x i16> %shift } define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { -; SSE-LABEL: constant_shift_v8i8: -; SSE: # %bb.0: -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: constant_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: constant_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE41-LABEL: constant_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v8i8: ; XOP: # %bb.0: -; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512DQ-LABEL: constant_shift_v8i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # 
kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQVL-LABEL: constant_shift_v8i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: constant_shift_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i8: ; X32-SSE: # %bb.0: +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: packuswb %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <8 x i8> %a, ret <8 x i8> %shift @@ -1377,61 +1545,87 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: constant_shift_v4i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: constant_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: constant_shift_v4i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: constant_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: constant_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v4i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8] -; 
X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: packuswb %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <4 x i8> %a, ret <4 x i8> %shift @@ -1440,58 +1634,87 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllq $2, %xmm1 -; SSE2-NEXT: psllq $3, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq $3, %xmm1 -; SSE41-NEXT: psllq $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $3, %xmm0, %xmm1 -; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: constant_shift_v2i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; XOP-LABEL: constant_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; -; XOPAVX2-LABEL: constant_shift_v2i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512-LABEL: constant_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: constant_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psllq $2, %xmm1 -; X32-SSE-NEXT: psllq $3, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: packuswb %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <2 x i8> %a, ret <2 x i8> %shift @@ -1504,32 +1727,32 @@ define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v2i32: ; SSE: # %bb.0: -; SSE-NEXT: psllq $5, %xmm0 +; SSE-NEXT: pslld $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpsllq $5, %xmm0, %xmm0 +; AVX-NEXT: vpslld $5, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v2i32: ; XOP: # %bb.0: -; XOP-NEXT: vpsllq $5, %xmm0, %xmm0 +; XOP-NEXT: vpslld $5, %xmm0, %xmm0 ; XOP-NEXT: retq ; 
; AVX512-LABEL: splatconstant_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $5, %xmm0, %xmm0 +; AVX512-NEXT: vpslld $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $5, %xmm0, %xmm0 +; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v2i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $5, %xmm0 +; X32-SSE-NEXT: pslld $5, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <2 x i32> %a, ret <2 x i32> %shift @@ -1538,32 +1761,32 @@ define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v4i16: ; SSE: # %bb.0: -; SSE-NEXT: pslld $3, %xmm0 +; SSE-NEXT: psllw $3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpslld $3, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v4i16: ; XOP: # %bb.0: -; XOP-NEXT: vpslld $3, %xmm0, %xmm0 +; XOP-NEXT: vpsllw $3, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $3, %xmm0, %xmm0 +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpslld $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $3, %xmm0 +; X32-SSE-NEXT: psllw $3, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <4 x i16> %a, ret <4 x i16> %shift @@ -1572,32 +1795,32 @@ define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v2i16: ; SSE: # %bb.0: -; SSE-NEXT: psllq $3, %xmm0 +; SSE-NEXT: psllw $3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v2i16: ; AVX: # %bb.0: -; AVX-NEXT: vpsllq $3, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v2i16: ; XOP: # %bb.0: -; XOP-NEXT: vpsllq $3, %xmm0, %xmm0 +; XOP-NEXT: vpsllw $3, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $3, %xmm0, %xmm0 +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v2i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $3, %xmm0 +; X32-SSE-NEXT: psllw $3, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <2 x i16> %a, ret <2 x i16> %shift @@ -1607,31 +1830,36 @@ ; SSE-LABEL: splatconstant_shift_v8i8: ; SSE: # %bb.0: ; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v8i8: ; AVX: # %bb.0: ; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v8i8: ; XOP: # %bb.0: -; XOP-NEXT: vpsllw $3, %xmm0, %xmm0 +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v8i8: ; X32-SSE: # %bb.0: ; 
X32-SSE-NEXT: psllw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <8 x i8> %a, ret <8 x i8> %shift @@ -1640,32 +1868,37 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v4i8: ; SSE: # %bb.0: -; SSE-NEXT: pslld $3, %xmm0 +; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v4i8: ; AVX: # %bb.0: -; AVX-NEXT: vpslld $3, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v4i8: ; XOP: # %bb.0: -; XOP-NEXT: vpslld $3, %xmm0, %xmm0 +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $3, %xmm0, %xmm0 +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v4i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpslld $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v4i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $3, %xmm0 +; X32-SSE-NEXT: psllw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <4 x i8> %a, ret <4 x i8> %shift @@ -1674,32 +1907,37 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE-LABEL: splatconstant_shift_v2i8: ; SSE: # %bb.0: -; SSE-NEXT: psllq $3, %xmm0 +; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpsllq $3, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v2i8: ; XOP: # %bb.0: -; XOP-NEXT: vpsllq $3, %xmm0, %xmm0 +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $3, %xmm0, %xmm0 +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllq $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllq $3, %xmm0 +; X32-SSE-NEXT: psllw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: retl %shift = shl <2 x i8> %a, ret <2 x i8> %shift Index: llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -841,26 +841,14 @@ } define <16 x i8> @PR20540(<8 x i8> %a) { -; SSE2-LABEL: PR20540: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR20540: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR20540: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq +; SSE-LABEL: PR20540: +; 
SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: retq ; ; AVX-LABEL: PR20540: ; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: retq %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle Index: llvm/test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1715,41 +1715,37 @@ ; SSE2-LABEL: combine_test1c: ; SSE2: # %bb.0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: andps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test1c: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test1c: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: combine_test1c: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_test1c: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX2-NEXT: retq +; AVX-LABEL: combine_test1c: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %A = load <4 x i8>, <4 x i8>* %a %B = load <4 x i8>, <4 x i8>* %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -1758,40 +1754,18 @@ } define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { -; SSE2-LABEL: combine_test2c: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: combine_test2c: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: combine_test2c: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: retq +; SSE-LABEL: combine_test2c: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: retq ; ; AVX-LABEL: combine_test2c: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: retq %A = load <4 x i8>, <4 x i8>* %a %B = load <4 x i8>, <4 x i8>* %b @@ -1801,40 +1775,20 @@ } define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { -; SSE2-LABEL: combine_test3c: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: combine_test3c: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: combine_test3c: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE41-NEXT: retq +; SSE-LABEL: combine_test3c: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: retq ; ; AVX-LABEL: combine_test3c: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX-NEXT: retq %A = load <4 x i8>, <4 x i8>* %a %B = load <4 x i8>, <4 x i8>* %b @@ -1846,48 +1800,38 @@ define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) { ; SSE2-LABEL: combine_test4c: ; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: andps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test4c: ; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test4c: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: combine_test4c: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_test4c: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-NEXT: retq +; AVX-LABEL: combine_test4c: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %A = load <4 x i8>, <4 x i8>* %a %B = load <4 x i8>, <4 x i8>* %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> Index: llvm/test/CodeGen/X86/vector-trunc-packus.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -1411,6 +1411,7 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: packuswb %xmm3, %xmm2 ; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v8i64_v8i8: @@ -1517,6 +1518,7 @@ ; SSSE3-NEXT: pand %xmm5, %xmm2 ; SSSE3-NEXT: packuswb %xmm3, %xmm2 ; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v8i64_v8i8: @@ -1615,6 +1617,7 @@ ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 ; SSE41-NEXT: packusdw %xmm4, %xmm3 ; SSE41-NEXT: packusdw %xmm3, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 ; SSE41-NEXT: 
movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -1643,25 +1646,33 @@ ; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v8i64_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1670,7 +1681,7 @@ ; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp slt <8 x i64> %a0, @@ -2041,14 +2052,16 @@ ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2777,84 +2790,25 @@ } define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> 
%a0) { -; SSE2-LABEL: trunc_packus_v8i32_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_packus_v8i32_v8i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_packus_v8i32_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSE41-NEXT: pminsd %xmm2, %xmm0 -; SSE41-NEXT: pminsd %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmaxsd %xmm2, %xmm1 -; SSE41-NEXT: pmaxsd %xmm2, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: trunc_packus_v8i32_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v8i32_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v8i32_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2864,8 +2818,7 @@ ; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2874,7 +2827,7 @@ ; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, 
%ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -2884,8 +2837,7 @@ ; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2894,7 +2846,7 @@ ; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %1 = icmp slt <8 x i32> %a0, @@ -2937,8 +2889,7 @@ ; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2957,8 +2908,7 @@ ; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: llvm/test/CodeGen/X86/vector-trunc-ssat.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -1178,7 +1178,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] ; SSE2-NEXT: movdqa %xmm9, %xmm7 @@ -1189,23 +1189,10 @@ ; SSE2-NEXT: pand %xmm10, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm9, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 @@ -1215,85 +1202,104 @@ ; SSE2-NEXT: pand %xmm10, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: 
pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm7 ; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn 
%xmm8, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm8, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: packssdw %xmm3, %xmm1 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v8i64_v8i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSSE3-NEXT: pxor %xmm4, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] ; SSSE3-NEXT: movdqa %xmm9, %xmm7 @@ -1304,23 +1310,10 @@ ; SSSE3-NEXT: pand %xmm10, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm3 ; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSSE3-NEXT: movdqa %xmm9, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 @@ -1330,223 +1323,267 @@ ; SSSE3-NEXT: pand %xmm10, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm2 ; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 ; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, 
%xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm0, %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] ; SSSE3-NEXT: movdqa %xmm7, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm7, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm7 ; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm7, %xmm0 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm2, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm5, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = 
xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm3, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm5 ; SSSE3-NEXT: pandn %xmm8, %xmm1 ; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: packssdw %xmm3, %xmm1 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: packuswb %xmm7, %xmm0 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm10 +; SSE41-NEXT: movdqa %xmm0, %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [127,127] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm11, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm10, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 -; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm11 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm2 +; SSE41-NEXT: movdqa %xmm10, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} 
xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] ; SSE41-NEXT: movapd %xmm7, %xmm0 ; SSE41-NEXT: xorpd %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840] ; SSE41-NEXT: movapd %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 -; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm0 ; SSE41-NEXT: xorpd %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: packssdw %xmm6, %xmm1 -; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: movapd %xmm11, %xmm0 ; SSE41-NEXT: xorpd %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2 -; SSE41-NEXT: xorpd %xmm8, %xmm5 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm5 ; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: packssdw %xmm2, %xmm3 -; SSE41-NEXT: packssdw %xmm3, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE41-NEXT: 
andpd %xmm0, %xmm2 +; SSE41-NEXT: andpd %xmm0, %xmm3 +; SSE41-NEXT: packusdw %xmm2, %xmm3 +; SSE41-NEXT: andpd %xmm0, %xmm7 +; SSE41-NEXT: andpd %xmm0, %xmm1 +; SSE41-NEXT: packusdw %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm3, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [127,127,127,127] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 +; AVX1-NEXT: vblendvpd %ymm7, %ymm1, %ymm8, %ymm9 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm11 +; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm8, %ymm8 +; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm11, %ymm0 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm9, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v8i64_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 ; AVX2-NEXT: 
vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1554,7 +1591,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp slt <8 x i64> %a0, @@ -1971,14 +2008,16 @@ ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2739,92 +2778,25 @@ } define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { -; SSE2-LABEL: trunc_ssat_v8i32_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[4294967168,4294967168,4294967168,4294967168] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_ssat_v8i32_v8i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn %xmm1, %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: packssdw %xmm2, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_ssat_v8i32_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127] -; SSE41-NEXT: pminsd %xmm2, %xmm0 -; SSE41-NEXT: pminsd %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168] -; SSE41-NEXT: pmaxsd %xmm2, %xmm1 -; SSE41-NEXT: pmaxsd %xmm2, %xmm0 -; SSE41-NEXT: packssdw %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: trunc_ssat_v8i32_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v8i32_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127] -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v8i32_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2834,8 +2806,7 @@ ; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] ; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2843,7 +2814,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: 
vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -2853,8 +2824,7 @@ ; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] ; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2862,7 +2832,7 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %1 = icmp slt <8 x i32> %a0, @@ -2905,8 +2875,7 @@ ; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] ; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2923,8 +2892,7 @@ ; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] ; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: llvm/test/CodeGen/X86/vector-trunc-usat.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -949,55 +949,56 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) { ; SSE2-LABEL: trunc_usat_v8i64_v8i8: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: 
pandn %xmm8, %xmm6 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 ; SSE2-NEXT: movdqa %xmm9, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pandn %xmm8, %xmm4 ; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm2, %xmm6 ; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm1 @@ -1006,59 +1007,61 @@ ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: packuswb %xmm4, %xmm1 ; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v8i64_v8i8: ; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm6, %xmm0 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm1 -; 
SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: packuswb %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm5, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 ; SSSE3-NEXT: movdqa %xmm9, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: pandn %xmm8, %xmm4 ; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm2, %xmm6 ; SSSE3-NEXT: movdqa %xmm9, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: pand %xmm3, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm1 @@ -1067,6 +1070,7 @@ ; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: packuswb %xmm4, %xmm1 ; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i8: @@ -1120,6 +1124,7 @@ ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: packusdw %xmm4, %xmm5 ; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -1145,6 +1150,7 @@ ; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1152,24 +1158,31 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; 
AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_usat_v8i64_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp ult <8 x i64> %a0, @@ -1399,14 +1412,16 @@ ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1856,59 +1871,70 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { ; SSE2-LABEL: trunc_usat_v8i32_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: 
pandn %xmm3, %xmm6 ; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v8i32_v8i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] ; SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pand %xmm6, %xmm0 ; SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSSE3-NEXT: por %xmm1, %xmm6 -; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: packuswb %xmm6, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm1, %xmm5 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i32_v8i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSE41-NEXT: pminud %xmm2, %xmm1 ; SSE41-NEXT: pminud %xmm2, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: pminud %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i32_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255] -; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1917,7 +1943,10 @@ ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1925,15 +1954,14 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: 
vpminud %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v8i32_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1941,15 +1969,14 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %1 = icmp ult <8 x i32> %a0, @@ -1998,32 +2025,35 @@ ; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: pandn %xmm2, %xmm5 ; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; SSSE3-NEXT: pshufb %xmm0, %xmm5 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSSE3-NEXT: movq %xmm6, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i32_v8i8_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSE41-NEXT: pminud %xmm2, %xmm1 ; SSE41-NEXT: pminud %xmm2, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm0, %xmm0 +; SSE41-NEXT: pminud %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i32_v8i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255] -; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vmovq %xmm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2033,8 +2063,10 @@ ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: 
vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2043,8 +2075,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2059,8 +2090,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: llvm/test/CodeGen/X86/vector-trunc.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc.ll +++ llvm/test/CodeGen/X86/vector-trunc.ll @@ -296,32 +296,22 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: trunc8i64_8i8: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc8i64_8i8: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vmovq %xmm0, (%rax) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: trunc8i64_8i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i8: ; AVX512: # %bb.0: # %entry @@ -587,9 +577,11 @@ ; ; AVX2-LABEL: trunc8i32_8i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vextracti128 
$1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -597,8 +589,7 @@ ; AVX512F-LABEL: trunc8i32_8i8: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -612,8 +603,7 @@ ; AVX512BW-LABEL: trunc8i32_8i8: ; AVX512BW: # %bb.0: # %entry ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1480,39 +1470,53 @@ ; ; AVX1-LABEL: trunc2x4i64_8i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc2x4i64_8i16: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc2x4i64_8i16: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1520,22 +1524,16 @@ ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc2x4i64_8i16: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1544,21 +1542,17 @@ ; AVX512BW: # %bb.0: # %entry ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc2x4i64_8i16: ; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm2 -; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,2,4,6,8,10,12,14] -; AVX512BWVL-NEXT: vpermi2w %xmm1, %xmm2, %xmm0 +; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vzeroupper 
; AVX512BWVL-NEXT: retq entry: Index: llvm/test/CodeGen/X86/vector-truncate-combine.ll =================================================================== --- llvm/test/CodeGen/X86/vector-truncate-combine.ll +++ llvm/test/CodeGen/X86/vector-truncate-combine.ll @@ -14,12 +14,8 @@ ; NOTE: This operation could be collapsed in to a single truncate. Once that is done ; this test will have to be adjusted. -; CHECK: PUNPCKLBWrr -; CHECK: PUNPCKLWDrr ; CHECK: PANDrm ; CHECK: PACKUSWBrr -; CHECK: PACKUSWBrr -; CHECK: PACKUSWBrr ; CHECK: MOVPDI2DIrr define void @test(double %vec.coerce) local_unnamed_addr { Index: llvm/test/CodeGen/X86/vector-zext.ll =================================================================== --- llvm/test/CodeGen/X86/vector-zext.ll +++ llvm/test/CodeGen/X86/vector-zext.ll @@ -397,16 +397,15 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: zext_16i8_to_8i64: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -414,15 +413,13 @@ ; ; SSSE3-LABEL: zext_16i8_to_8i64: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128] -; SSSE3-NEXT: pshufb %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; SSSE3-NEXT: pshufb %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pshufb %xmm4, %xmm2 -; SSSE3-NEXT: pshufb %xmm5, %xmm3 +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_16i8_to_8i64: @@ -1159,16 +1156,15 @@ define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> 
*%ptr) { ; SSE2-LABEL: load_zext_8i8_to_8i64: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -1176,16 +1172,14 @@ ; ; SSSE3-LABEL: load_zext_8i8_to_8i64: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; SSSE3-NEXT: pshufb %xmm5, %xmm1 +; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pshufb %xmm4, %xmm2 -; SSSE3-NEXT: pshufb %xmm5, %xmm3 +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_zext_8i8_to_8i64: @@ -1508,8 +1502,8 @@ ; SSE2-LABEL: zext_8i8_to_8i32: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] @@ -1518,8 +1512,8 @@ ; SSSE3-LABEL: zext_8i8_to_8i32: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] @@ -1527,32 +1521,28 @@ ; ; SSE41-LABEL: zext_8i8_to_8i32: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_8i8_to_8i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i8_to_8i32: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: retq ; ; AVX512-LABEL: zext_8i8_to_8i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512-NEXT: retq entry: %t = zext <8 x i8> %z to <8 x i32> @@ -1659,8 +1649,8 @@ ; SSE2-LABEL: shuf_zext_8i8_to_8i32: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] @@ -1669,8 +1659,8 @@ ; SSSE3-LABEL: 
shuf_zext_8i8_to_8i32: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] @@ -1678,7 +1668,6 @@ ; ; SSE41-LABEL: shuf_zext_8i8_to_8i32: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -1687,7 +1676,6 @@ ; ; AVX1-LABEL: shuf_zext_8i8_to_8i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -1696,13 +1684,11 @@ ; ; AVX2-LABEL: shuf_zext_8i8_to_8i32: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: retq ; ; AVX512-LABEL: shuf_zext_8i8_to_8i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512-NEXT: retq entry: @@ -1731,33 +1717,11 @@ ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: retq ; -; AVX1-LABEL: shuf_zext_16i8_to_2i64_offset6: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuf_zext_16i8_to_2i64_offset6: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: shuf_zext_16i8_to_2i64_offset6: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: shuf_zext_16i8_to_2i64_offset6: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; 
AVX512F-NEXT: retq -; -; AVX512BW-LABEL: shuf_zext_16i8_to_2i64_offset6: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: retq +; AVX-LABEL: shuf_zext_16i8_to_2i64_offset6: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq entry: %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <16 x i32> %Z = bitcast <16 x i8> %B to <2 x i64> @@ -1841,33 +1805,11 @@ ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: retq ; -; AVX1-LABEL: shuf_zext_8i16_to_2i64_offset6: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuf_zext_8i16_to_2i64_offset6: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: shuf_zext_8i16_to_2i64_offset6: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: shuf_zext_8i16_to_2i64_offset6: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: shuf_zext_8i16_to_2i64_offset6: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: retq +; AVX-LABEL: shuf_zext_8i16_to_2i64_offset6: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: retq entry: %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> %Z = bitcast <8 x i16> %B to <2 x i64> @@ -2278,11 +2220,11 @@ ; AVX2-LABEL: zext_32i8_to_32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = 
xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vmovdqa %ymm4, %ymm0 ; AVX2-NEXT: retq @@ -2306,28 +2248,33 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_2i8_to_2i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movzwl (%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[3],zero,zero,zero -; SSSE3-NEXT: paddq %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: paddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_2i8_to_2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: zext_2i8_to_2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %x = load <2 x i8>, <2 x i8>* %addr, align 1 %y = zext <2 x i8> %x to <2 x i32> @@ -2636,7 +2583,8 @@ ; ; AVX1-LABEL: splatshuf_zext_v8i32_matching_undefs: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[6,7],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -2732,7 +2680,8 @@ ; ; AVX1-LABEL: splatshuf_zext_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; Index: llvm/test/CodeGen/X86/vsel-cmp-load.ll =================================================================== --- llvm/test/CodeGen/X86/vsel-cmp-load.ll +++ llvm/test/CodeGen/X86/vsel-cmp-load.ll @@ -8,12 +8,12 @@ define <8 x i32> @eq_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) { ; AVX1-LABEL: eq_zero: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -28,8 +28,8 @@ ; ; AVX512-LABEL: eq_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512-NEXT: vptestnmw %xmm2, %xmm2, %k1 +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} ; AVX512-NEXT: retq %load = load <8 x i8>, <8 x i8>* %p @@ -41,14 +41,14 @@ define <4 x i64> @ne_zero(<4 x i16>* %p, <4 x i64> %x, <4 x i64> %y) { ; AVX1-LABEL: ne_zero: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -63,8 +63,8 @@ ; ; AVX512-LABEL: ne_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vptestmw %xmm2, %xmm2, %k1 ; AVX512-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} ; AVX512-NEXT: retq %load = load <4 x i16>, <4 x i16>* %p @@ -111,10 +111,10 @@ define <8 x i32> @slt_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) { ; AVX1-LABEL: slt_zero: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbw (%rdi), %xmm2 -; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; 
AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -127,8 +127,8 @@ ; ; AVX512-LABEL: slt_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbw (%rdi), %xmm2 -; AVX512-NEXT: vpmovw2m %xmm2, %k1 +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vpmovb2m %xmm2, %k1 ; AVX512-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} ; AVX512-NEXT: retq %load = load <8 x i8>, <8 x i8>* %p @@ -140,12 +140,12 @@ define <4 x double> @eq_zero_fp_select(<4 x i8>* %p, <4 x double> %x, <4 x double> %y) { ; AVX1-LABEL: eq_zero_fp_select: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm3 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -160,8 +160,8 @@ ; ; AVX512-LABEL: eq_zero_fp_select: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX512-NEXT: vptestnmd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; AVX512-NEXT: retq %load = load <4 x i8>, <4 x i8>* %p @@ -173,14 +173,14 @@ define <8 x float> @ne_zero_fp_select(<8 x i8>* %p, <8 x float> %x, <8 x float> %y) { ; AVX1-LABEL: ne_zero_fp_select: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -195,8 +195,8 @@ ; ; AVX512-LABEL: ne_zero_fp_select: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512-NEXT: vptestmw %xmm2, %xmm2, %k1 +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vptestmb %xmm2, %xmm2, %k1 ; AVX512-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; AVX512-NEXT: retq %load = load <8 x i8>, <8 x i8>* %p @@ -208,12 +208,12 @@ define <4 x double> @sgt_zero_fp_select(<4 x i8>* %p, <4 x double> %x, <4 x double> %y) { ; AVX1-LABEL: sgt_zero_fp_select: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbd (%rdi), %xmm2 +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3 -; AVX1-NEXT: vpshufd 
{{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm3 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -228,9 +228,9 @@ ; ; AVX512-LABEL: sgt_zero_fp_select: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd (%rdi), %xmm2 +; AVX512-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; AVX512-NEXT: vpcmpgtb %xmm3, %xmm2, %k1 ; AVX512-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; AVX512-NEXT: retq %load = load <4 x i8>, <4 x i8>* %p Index: llvm/test/CodeGen/X86/vselect-avx.ll =================================================================== --- llvm/test/CodeGen/X86/vselect-avx.ll +++ llvm/test/CodeGen/X86/vselect-avx.ll @@ -96,11 +96,12 @@ ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: vmovq %xmm1, (%rsi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test3: @@ -118,11 +119,12 @@ ; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT: vmovq %xmm0, (%rsi) +; AVX2-NEXT: vmovq %xmm1, (%rsi) ; AVX2-NEXT: retq %tmp6 = srem <4 x i32> %induction30, %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer Index: llvm/test/CodeGen/X86/vselect.ll =================================================================== --- llvm/test/CodeGen/X86/vselect.ll +++ llvm/test/CodeGen/X86/vselect.ll @@ -567,36 +567,43 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) { ; SSE2-LABEL: simplify_select: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $63, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: movd %edi, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: simplify_select: ; SSE41: # %bb.0: -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: movd %edi, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; 
SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSE41-NEXT: pinsrd $1, %edi, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: simplify_select: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: simplify_select: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $edi killed $edi def $rdi -; AVX2-NEXT: vmovq %rdi, %xmm0 -; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: simplify_select: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; AVX-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: retq %a = insertelement <2 x i32> , i32 %x, i32 1 %b = insertelement <2 x i32> , i32 %x, i32 0 %y = or <2 x i32> %a, %b @@ -647,34 +654,28 @@ ; SSE2-LABEL: vselect_any_extend_vector_inreg_crash: ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pcmpeqw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: psllq $56, %xmm0 +; SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: andl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vselect_any_extend_vector_inreg_crash: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE41-NEXT: pcmpeqw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: psllq $56, %xmm0 -; SSE41-NEXT: movl $32768, %eax # imm = 0x8000 -; SSE41-NEXT: movq %rax, %xmm1 -; SSE41-NEXT: xorpd %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: andl $32768, %eax # imm = 0x8000 ; SSE41-NEXT: retq ; ; AVX-LABEL: vselect_any_extend_vector_inreg_crash: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX-NEXT: vpcmpeqw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmovsxbq %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: andl $32768, %eax # imm = 0x8000 ; AVX-NEXT: retq Index: llvm/test/CodeGen/X86/vshift-4.ll =================================================================== --- llvm/test/CodeGen/X86/vshift-4.ll +++ llvm/test/CodeGen/X86/vshift-4.ll @@ -58,7 +58,7 @@ ; X32-LABEL: shift2a: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; 
X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -67,7 +67,7 @@ ; ; X64-LABEL: shift2a: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 @@ -84,7 +84,7 @@ ; X32-LABEL: shift2b: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -93,7 +93,7 @@ ; ; X64-LABEL: shift2b: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 @@ -110,7 +110,7 @@ ; X32-LABEL: shift2c: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X32-NEXT: pslld %xmm2, %xmm0 @@ -119,7 +119,7 @@ ; ; X64-LABEL: shift2c: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; X64-NEXT: pslld %xmm2, %xmm0 Index: llvm/test/CodeGen/X86/widen_arith-1.ll =================================================================== --- llvm/test/CodeGen/X86/widen_arith-1.ll +++ llvm/test/CodeGen/X86/widen_arith-1.ll @@ -4,10 +4,9 @@ define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind { ; CHECK-LABEL: update: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subl $12, %esp +; CHECK-NEXT: pushl %eax ; CHECK-NEXT: movl $0, (%esp) ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -19,15 +18,14 @@ ; CHECK-NEXT: movl (%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: pmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; CHECK-NEXT: psubd %xmm0, %xmm2 -; CHECK-NEXT: pextrb $8, %xmm2, 2(%ecx,%eax,4) -; CHECK-NEXT: pshufb %xmm1, %xmm2 -; CHECK-NEXT: pextrw $0, %xmm2, (%ecx,%eax,4) +; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: psubb %xmm0, %xmm1 +; CHECK-NEXT: pextrb $2, %xmm1, 2(%ecx,%eax,4) +; CHECK-NEXT: pextrw $0, %xmm1, (%ecx,%eax,4) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .LBB0_3: # %afterfor -; CHECK-NEXT: addl $12, %esp +; CHECK-NEXT: popl %eax ; CHECK-NEXT: retl entry: %dst.addr = alloca <3 x i8>* Index: llvm/test/CodeGen/X86/widen_arith-2.ll =================================================================== --- llvm/test/CodeGen/X86/widen_arith-2.ll +++ llvm/test/CodeGen/X86/widen_arith-2.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: movl $0, (%esp) ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <4,4,4,4,4,4,4,4,u,u,u,u,u,u,u,u> ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: 
.LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -25,10 +25,9 @@ ; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; CHECK-NEXT: psubw %xmm0, %xmm2 +; CHECK-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: psubb %xmm0, %xmm2 ; CHECK-NEXT: pand %xmm1, %xmm2 -; CHECK-NEXT: packuswb %xmm0, %xmm2 ; CHECK-NEXT: movq %xmm2, (%edx,%eax,8) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: jmp .LBB0_1 Index: llvm/test/CodeGen/X86/widen_arith-3.ll =================================================================== --- llvm/test/CodeGen/X86/widen_arith-3.ll +++ llvm/test/CodeGen/X86/widen_arith-3.ll @@ -12,8 +12,7 @@ ; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-8, %esp -; CHECK-NEXT: subl $32, %esp -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; CHECK-NEXT: subl $16, %esp ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 ; CHECK-NEXT: movw $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $65537, {{[0-9]+}}(%esp) # imm = 0x10001 @@ -29,11 +28,11 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl 12(%ebp), %edx ; CHECK-NEXT: movl 8(%ebp), %ecx -; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; CHECK-NEXT: psubd %xmm0, %xmm2 -; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8) -; CHECK-NEXT: pshufb %xmm1, %xmm2 -; CHECK-NEXT: movd %xmm2, (%ecx,%eax,8) +; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: pinsrw $2, 4(%edx,%eax,8), %xmm1 +; CHECK-NEXT: psubw %xmm0, %xmm1 +; CHECK-NEXT: pextrw $2, %xmm1, 4(%ecx,%eax,8) +; CHECK-NEXT: movd %xmm1, (%ecx,%eax,8) ; CHECK-NEXT: incl {{[0-9]+}}(%esp) ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .LBB0_3: # %afterfor Index: llvm/test/CodeGen/X86/widen_bitops-0.ll =================================================================== --- llvm/test/CodeGen/X86/widen_bitops-0.ll +++ llvm/test/CodeGen/X86/widen_bitops-0.ll @@ -132,15 +132,15 @@ ; X32-SSE-LABEL: and_v3i8_as_i24: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0 -; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0 +; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0 +; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0 ; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1 -; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1 +; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1 +; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1 ; X32-SSE-NEXT: pand %xmm0, %xmm1 ; X32-SSE-NEXT: pextrb $0, %xmm1, %eax -; X32-SSE-NEXT: pextrb $4, %xmm1, %edx -; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx +; X32-SSE-NEXT: pextrb $1, %xmm1, %edx +; X32-SSE-NEXT: pextrb $2, %xmm1, %ecx ; X32-SSE-NEXT: # kill: def $al killed $al killed $eax ; X32-SSE-NEXT: # kill: def $dl killed $dl killed $edx ; X32-SSE-NEXT: # kill: def $cl killed $cl killed $ecx @@ -149,15 +149,15 @@ ; X64-SSE-LABEL: and_v3i8_as_i24: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0 -; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0 +; X64-SSE-NEXT: pinsrb $1, %r8d, %xmm0 +; X64-SSE-NEXT: pinsrb $2, %r9d, %xmm0 ; X64-SSE-NEXT: movd %edi, %xmm1 -; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1 -; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1 +; X64-SSE-NEXT: pinsrb $1, %esi, %xmm1 +; 
X64-SSE-NEXT: pinsrb $2, %edx, %xmm1 ; X64-SSE-NEXT: pand %xmm0, %xmm1 ; X64-SSE-NEXT: pextrb $0, %xmm1, %eax -; X64-SSE-NEXT: pextrb $4, %xmm1, %edx -; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx +; X64-SSE-NEXT: pextrb $1, %xmm1, %edx +; X64-SSE-NEXT: pextrb $2, %xmm1, %ecx ; X64-SSE-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE-NEXT: # kill: def $dl killed $dl killed $edx ; X64-SSE-NEXT: # kill: def $cl killed $cl killed $ecx @@ -173,15 +173,15 @@ ; X32-SSE-LABEL: xor_v3i8_as_i24: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0 -; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0 +; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0 +; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0 ; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1 -; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1 +; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1 +; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1 ; X32-SSE-NEXT: pxor %xmm0, %xmm1 ; X32-SSE-NEXT: pextrb $0, %xmm1, %eax -; X32-SSE-NEXT: pextrb $4, %xmm1, %edx -; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx +; X32-SSE-NEXT: pextrb $1, %xmm1, %edx +; X32-SSE-NEXT: pextrb $2, %xmm1, %ecx ; X32-SSE-NEXT: # kill: def $al killed $al killed $eax ; X32-SSE-NEXT: # kill: def $dl killed $dl killed $edx ; X32-SSE-NEXT: # kill: def $cl killed $cl killed $ecx @@ -190,15 +190,15 @@ ; X64-SSE-LABEL: xor_v3i8_as_i24: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0 -; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0 +; X64-SSE-NEXT: pinsrb $1, %r8d, %xmm0 +; X64-SSE-NEXT: pinsrb $2, %r9d, %xmm0 ; X64-SSE-NEXT: movd %edi, %xmm1 -; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1 -; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1 +; X64-SSE-NEXT: pinsrb $1, %esi, %xmm1 +; X64-SSE-NEXT: pinsrb $2, %edx, %xmm1 ; X64-SSE-NEXT: pxor %xmm0, %xmm1 ; X64-SSE-NEXT: pextrb $0, %xmm1, %eax -; X64-SSE-NEXT: pextrb $4, %xmm1, %edx -; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx +; X64-SSE-NEXT: pextrb $1, %xmm1, %edx +; X64-SSE-NEXT: pextrb $2, %xmm1, %ecx ; X64-SSE-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE-NEXT: # kill: def $dl killed $dl killed $edx ; X64-SSE-NEXT: # kill: def $cl killed $cl killed $ecx @@ -214,15 +214,15 @@ ; X32-SSE-LABEL: or_v3i8_as_i24: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0 -; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0 +; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0 +; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0 ; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1 -; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1 +; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1 +; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1 ; X32-SSE-NEXT: por %xmm0, %xmm1 ; X32-SSE-NEXT: pextrb $0, %xmm1, %eax -; X32-SSE-NEXT: pextrb $4, %xmm1, %edx -; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx +; X32-SSE-NEXT: pextrb $1, %xmm1, %edx +; X32-SSE-NEXT: pextrb $2, %xmm1, %ecx ; X32-SSE-NEXT: # kill: def $al killed $al killed $eax ; X32-SSE-NEXT: # kill: def $dl killed $dl killed $edx ; X32-SSE-NEXT: # kill: def $cl killed $cl killed $ecx @@ -231,15 +231,15 @@ ; X64-SSE-LABEL: or_v3i8_as_i24: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0 -; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0 +; X64-SSE-NEXT: pinsrb $1, %r8d, %xmm0 +; X64-SSE-NEXT: 
pinsrb $2, %r9d, %xmm0 ; X64-SSE-NEXT: movd %edi, %xmm1 -; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1 -; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1 +; X64-SSE-NEXT: pinsrb $1, %esi, %xmm1 +; X64-SSE-NEXT: pinsrb $2, %edx, %xmm1 ; X64-SSE-NEXT: por %xmm0, %xmm1 ; X64-SSE-NEXT: pextrb $0, %xmm1, %eax -; X64-SSE-NEXT: pextrb $4, %xmm1, %edx -; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx +; X64-SSE-NEXT: pextrb $1, %xmm1, %edx +; X64-SSE-NEXT: pextrb $2, %xmm1, %ecx ; X64-SSE-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE-NEXT: # kill: def $dl killed $dl killed $edx ; X64-SSE-NEXT: # kill: def $cl killed $cl killed $ecx Index: llvm/test/CodeGen/X86/widen_cast-1.ll =================================================================== --- llvm/test/CodeGen/X86/widen_cast-1.ll +++ llvm/test/CodeGen/X86/widen_cast-1.ll @@ -12,7 +12,6 @@ ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: movl $0, (%esp) ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: cmpl $3, (%esp) ; CHECK-NEXT: jg .LBB0_3 ; CHECK-NEXT: .p2align 4, 0x90 @@ -21,10 +20,9 @@ ; CHECK-NEXT: movl (%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; CHECK-NEXT: psubw %xmm0, %xmm2 -; CHECK-NEXT: pshufb %xmm1, %xmm2 -; CHECK-NEXT: movq %xmm2, (%ecx,%eax,8) +; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: psubw %xmm0, %xmm1 +; CHECK-NEXT: movq %xmm1, (%ecx,%eax,8) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: cmpl $3, (%esp) ; CHECK-NEXT: jle .LBB0_2 @@ -36,7 +34,6 @@ ; ATOM: # %bb.0: # %entry ; ATOM-NEXT: pushl %eax ; ATOM-NEXT: pcmpeqd %xmm0, %xmm0 -; ATOM-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; ATOM-NEXT: movl $0, (%esp) ; ATOM-NEXT: cmpl $3, (%esp) ; ATOM-NEXT: jg .LBB0_3 @@ -45,12 +42,10 @@ ; ATOM-NEXT: # =>This Inner Loop Header: Depth=1 ; ATOM-NEXT: movl (%esp), %eax ; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; ATOM-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; ATOM-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; ATOM-NEXT: psubw %xmm0, %xmm2 -; ATOM-NEXT: pshufb %xmm1, %xmm2 -; ATOM-NEXT: movq %xmm2, (%ecx,%eax,8) +; ATOM-NEXT: psubw %xmm0, %xmm1 +; ATOM-NEXT: movq %xmm1, (%ecx,%eax,8) ; ATOM-NEXT: incl (%esp) ; ATOM-NEXT: cmpl $3, (%esp) ; ATOM-NEXT: jle .LBB0_2 Index: llvm/test/CodeGen/X86/widen_cast-2.ll =================================================================== --- llvm/test/CodeGen/X86/widen_cast-2.ll +++ llvm/test/CodeGen/X86/widen_cast-2.ll @@ -21,8 +21,9 @@ ; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2 ; CHECK-NEXT: psubw %xmm0, %xmm1 ; CHECK-NEXT: psubw %xmm0, %xmm2 +; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) +; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) ; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax) -; CHECK-NEXT: movq %xmm2, 16(%ecx,%eax) ; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: cmpl $3, (%esp) Index: llvm/test/CodeGen/X86/widen_cast-3.ll =================================================================== --- llvm/test/CodeGen/X86/widen_cast-3.ll +++ llvm/test/CodeGen/X86/widen_cast-3.ll @@ -11,7 +11,8 @@ ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: psubd %xmm1, %xmm0 ; X86-NEXT: pextrd $2, %xmm0, 8(%eax) -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: pextrd $1, %xmm0, 4(%eax) +; X86-NEXT: movd %xmm0, (%eax) ; X86-NEXT: retl 
; ; X64-LABEL: convert: Index: llvm/test/CodeGen/X86/widen_cast-4.ll =================================================================== --- llvm/test/CodeGen/X86/widen_cast-4.ll +++ llvm/test/CodeGen/X86/widen_cast-4.ll @@ -10,7 +10,8 @@ ; NARROW-NEXT: subl $12, %esp ; NARROW-NEXT: movl $0, (%esp) ; NARROW-NEXT: pcmpeqd %xmm0, %xmm0 -; NARROW-NEXT: movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; NARROW-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; NARROW-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; NARROW-NEXT: .p2align 4, 0x90 ; NARROW-NEXT: .LBB0_1: # %forcond ; NARROW-NEXT: # =>This Inner Loop Header: Depth=1 @@ -26,13 +27,13 @@ ; NARROW-NEXT: movl %edx, {{[0-9]+}}(%esp) ; NARROW-NEXT: addl {{[0-9]+}}(%esp), %ecx ; NARROW-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; NARROW-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; NARROW-NEXT: psubw %xmm0, %xmm2 -; NARROW-NEXT: psllw $8, %xmm2 -; NARROW-NEXT: psraw $8, %xmm2 -; NARROW-NEXT: psrlw $2, %xmm2 -; NARROW-NEXT: pshufb %xmm1, %xmm2 -; NARROW-NEXT: movq %xmm2, (%edx,%eax,8) +; NARROW-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; NARROW-NEXT: psubb %xmm0, %xmm3 +; NARROW-NEXT: psrlw $2, %xmm3 +; NARROW-NEXT: pand %xmm1, %xmm3 +; NARROW-NEXT: pxor %xmm2, %xmm3 +; NARROW-NEXT: psubb %xmm2, %xmm3 +; NARROW-NEXT: movq %xmm3, (%edx,%eax,8) ; NARROW-NEXT: incl (%esp) ; NARROW-NEXT: jmp .LBB0_1 ; NARROW-NEXT: .LBB0_3: # %afterfor Index: llvm/test/CodeGen/X86/widen_cast-5.ll =================================================================== --- llvm/test/CodeGen/X86/widen_cast-5.ll +++ llvm/test/CodeGen/X86/widen_cast-5.ll @@ -8,18 +8,15 @@ ; X86-LABEL: convert: ; X86: ## %bb.0: ## %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; X86-NEXT: pxor LCPI0_0, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: xorps LCPI0_0, %xmm0 +; X86-NEXT: movlps %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movq %rsi, %xmm0 -; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X64-NEXT: pxor {{.*}}(%rip), %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: llvm/test/CodeGen/X86/widen_cast-6.ll =================================================================== --- llvm/test/CodeGen/X86/widen_cast-6.ll +++ llvm/test/CodeGen/X86/widen_cast-6.ll @@ -7,9 +7,7 @@ define i32 @return_v2hi() nounwind { ; X86-LABEL: return_v2hi: ; X86: ## %bb.0: ## %entry -; X86-NEXT: pushl %eax ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: popl %ecx ; X86-NEXT: retl ; ; X64-LABEL: return_v2hi: Index: llvm/test/CodeGen/X86/widen_compare-1.ll =================================================================== --- llvm/test/CodeGen/X86/widen_compare-1.ll +++ llvm/test/CodeGen/X86/widen_compare-1.ll @@ -7,12 +7,12 @@ define <2 x i16> @compare_v2i64_to_v2i16_unary(<2 x i16>* %src) nounwind { ; X86-LABEL: compare_v2i64_to_v2i16_unary: ; X86: # %bb.0: -; X86-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0] +; X86-NEXT: pcmpeqd %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: compare_v2i64_to_v2i16_unary: ; X64: # %bb.0: -; X64-NEXT: movaps {{.*#+}} xmm0 = [65535,65535] +; X64-NEXT: pcmpeqd %xmm0, %xmm0 ; X64-NEXT: retq %val = load <2 x i16>, <2 x i16>* %src, align 4 %cmp 
= icmp uge <2 x i16> %val, %val @@ -25,20 +25,18 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X86-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X86-NEXT: pcmpgtq %xmm0, %xmm1 -; X86-NEXT: pcmpeqd %xmm0, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm0 +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pmaxuw %xmm1, %xmm0 +; X86-NEXT: pcmpeqw %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: compare_v2i64_to_v2i16_binary: ; X64: # %bb.0: -; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X64-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; X64-NEXT: pcmpgtq %xmm0, %xmm1 -; X64-NEXT: pcmpeqd %xmm0, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: pmaxuw %xmm1, %xmm0 +; X64-NEXT: pcmpeqw %xmm1, %xmm0 ; X64-NEXT: retq %val0 = load <2 x i16>, <2 x i16>* %src0, align 4 %val1 = load <2 x i16>, <2 x i16>* %src1, align 4 Index: llvm/test/CodeGen/X86/widen_conv-1.ll =================================================================== --- llvm/test/CodeGen/X86/widen_conv-1.ll +++ llvm/test/CodeGen/X86/widen_conv-1.ll @@ -8,16 +8,17 @@ ; X86-LABEL: convert_v2i64_to_v2i32: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert_v2i64_to_v2i32: ; X64: # %bb.0: # %entry -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: psubd %xmm1, %xmm0 ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: @@ -32,25 +33,23 @@ define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) nounwind { ; X86-LABEL: convert_v3i32_to_v3i8: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movdqa (%ecx), %xmm0 -; X86-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: pextrb $8, %xmm0, 2(%eax) ; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; X86-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-NEXT: psubb %xmm1, %xmm0 +; X86-NEXT: pextrb $2, %xmm0, 2(%eax) ; X86-NEXT: pextrw $0, %xmm0, (%eax) -; X86-NEXT: popl %eax ; X86-NEXT: retl ; ; X64-LABEL: convert_v3i32_to_v3i8: ; X64: # %bb.0: # %entry ; X64-NEXT: movdqa (%rsi), %xmm0 -; X64-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-NEXT: psubd %xmm1, %xmm0 -; X64-NEXT: pextrb $8, %xmm0, 2(%rdi) ; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: psubb %xmm1, %xmm0 +; X64-NEXT: pextrb $2, %xmm0, 2(%rdi) ; X64-NEXT: pextrw $0, %xmm0, (%rdi) ; X64-NEXT: retq entry: @@ -66,29 +65,23 @@ define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) nounwind { ; X86-LABEL: convert_v5i16_to_v5i8: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; X86-NEXT: movdqa (%ecx), %xmm0 -; X86-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-NEXT: psubw %xmm1, %xmm0 -; X86-NEXT: pextrb $8, %xmm0, 4(%eax) ; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; X86-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-NEXT: psubb %xmm1, %xmm0 +; X86-NEXT: pextrb $4, %xmm0, 4(%eax) ; X86-NEXT: movd %xmm0, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: convert_v5i16_to_v5i8: ; X64: # %bb.0: # %entry ; X64-NEXT: movdqa (%rsi), %xmm0 -; X64-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-NEXT: psubw %xmm1, %xmm0 -; X64-NEXT: pextrb $8, %xmm0, 4(%rdi) ; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: psubb %xmm1, %xmm0 +; X64-NEXT: pextrb $4, %xmm0, 4(%rdi) ; X64-NEXT: movd %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: llvm/test/CodeGen/X86/widen_conv-2.ll =================================================================== --- llvm/test/CodeGen/X86/widen_conv-2.ll +++ llvm/test/CodeGen/X86/widen_conv-2.ll @@ -8,17 +8,13 @@ ; X86-LABEL: convert_v2i16_v2i32: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: psllq $48, %xmm0 -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X86-NEXT: pmovsxwd %xmm0, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert_v2i16_v2i32: ; X64: # %bb.0: # %entry -; X64-NEXT: psllq $48, %xmm0 -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X64-NEXT: pmovsxwd %xmm0, %xmm0 ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: llvm/test/CodeGen/X86/widen_conv-3.ll =================================================================== --- llvm/test/CodeGen/X86/widen_conv-3.ll +++ llvm/test/CodeGen/X86/widen_conv-3.ll @@ -7,24 +7,37 @@ ; sign to float v2i16 to v2f32 define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind { -; X86-LABEL: convert_v2i16_to_v2f32: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: psllq $48, %xmm0 -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-NEXT: cvtdq2ps %xmm0, %xmm0 -; X86-NEXT: movlps %xmm0, (%eax) -; X86-NEXT: retl +; X86-SSE2-LABEL: convert_v2i16_to_v2f32: +; X86-SSE2: # %bb.0: # %entry +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: psrad $16, %xmm0 +; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: retl ; -; X64-LABEL: convert_v2i16_to_v2f32: -; X64: # %bb.0: # %entry -; X64-NEXT: psllq $48, %xmm0 -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X64-NEXT: cvtdq2ps %xmm0, %xmm0 -; X64-NEXT: movlps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE42-LABEL: convert_v2i16_to_v2f32: +; X86-SSE42: # %bb.0: # %entry +; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE42-NEXT: pmovsxwd %xmm0, %xmm0 +; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 +; X86-SSE42-NEXT: movlps %xmm0, (%eax) +; X86-SSE42-NEXT: retl +; +; X64-SSE2-LABEL: convert_v2i16_to_v2f32: +; X64-SSE2: # %bb.0: # %entry +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE2-NEXT: psrad $16, %xmm0 +; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; X64-SSE2-NEXT: movlps %xmm0, (%rdi) +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: convert_v2i16_to_v2f32: +; X64-SSE42: # %bb.0: # %entry +; X64-SSE42-NEXT: pmovsxwd %xmm0, %xmm0 +; 
X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 +; X64-SSE42-NEXT: movlps %xmm0, (%rdi) +; X64-SSE42-NEXT: retq entry: %val = sitofp <2 x i16> %src to <2 x float> store <2 x float> %val, <2 x float>* %dst.addr, align 4 @@ -36,28 +49,19 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind { ; X86-SSE2-LABEL: convert_v3i8_to_v3f32: ; X86-SSE2: # %bb.0: # %entry -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: andl $-16, %esp -; X86-SSE2-NEXT: subl $32, %esp -; X86-SSE2-NEXT: movl 8(%ebp), %eax -; X86-SSE2-NEXT: movl 12(%ebp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movzwl (%ecx), %edx ; X86-SSE2-NEXT: movd %edx, %xmm0 -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X86-SSE2-NEXT: movdqa %xmm0, (%esp) -; X86-SSE2-NEXT: movl (%esp), %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE2-NEXT: shll $8, %edx -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: pinsrw $1, %edx, %xmm0 -; X86-SSE2-NEXT: shll $8, %esi -; X86-SSE2-NEXT: pinsrw $3, %esi, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X86-SSE2-NEXT: pand %xmm1, %xmm0 ; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx -; X86-SSE2-NEXT: shll $8, %ecx -; X86-SSE2-NEXT: pinsrw $5, %ecx, %xmm0 +; X86-SSE2-NEXT: movd %ecx, %xmm2 +; X86-SSE2-NEXT: pslld $16, %xmm2 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-SSE2-NEXT: psrad $24, %xmm0 ; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE2-NEXT: movss %xmm0, (%eax) @@ -66,47 +70,35 @@ ; X86-SSE2-NEXT: movss %xmm1, 8(%eax) ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: movss %xmm0, 4(%eax) -; X86-SSE2-NEXT: leal -4(%ebp), %esp -; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: convert_v3i8_to_v3f32: ; X86-SSE42: # %bb.0: # %entry -; X86-SSE42-NEXT: pushl %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE42-NEXT: movzbl 2(%ecx), %edx -; X86-SSE42-NEXT: movzwl (%ecx), %ecx -; X86-SSE42-NEXT: movd %ecx, %xmm0 -; X86-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-SSE42-NEXT: pinsrd $2, %edx, %xmm0 -; X86-SSE42-NEXT: pslld $24, %xmm0 -; X86-SSE42-NEXT: psrad $24, %xmm0 +; X86-SSE42-NEXT: movzwl (%ecx), %edx +; X86-SSE42-NEXT: movd %edx, %xmm0 +; X86-SSE42-NEXT: pinsrb $2, 2(%ecx), %xmm0 +; X86-SSE42-NEXT: pmovsxbd %xmm0, %xmm0 ; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax) ; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax) ; X86-SSE42-NEXT: movss %xmm0, (%eax) -; X86-SSE42-NEXT: popl %eax ; X86-SSE42-NEXT: retl ; ; X64-SSE2-LABEL: convert_v3i8_to_v3f32: ; X64-SSE2: # %bb.0: # %entry ; X64-SSE2-NEXT: movzwl (%rsi), %eax ; X64-SSE2-NEXT: movd %eax, %xmm0 -; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; X64-SSE2-NEXT: 
movl -{{[0-9]+}}(%rsp), %ecx -; X64-SSE2-NEXT: shll $8, %eax -; X64-SSE2-NEXT: pxor %xmm0, %xmm0 -; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0 -; X64-SSE2-NEXT: shll $8, %ecx -; X64-SSE2-NEXT: pinsrw $3, %ecx, %xmm0 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X64-SSE2-NEXT: pand %xmm1, %xmm0 ; X64-SSE2-NEXT: movzbl 2(%rsi), %eax -; X64-SSE2-NEXT: shll $8, %eax -; X64-SSE2-NEXT: pinsrw $5, %eax, %xmm0 +; X64-SSE2-NEXT: movd %eax, %xmm2 +; X64-SSE2-NEXT: pslld $16, %xmm2 +; X64-SSE2-NEXT: pandn %xmm2, %xmm1 +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-SSE2-NEXT: psrad $24, %xmm0 ; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-SSE2-NEXT: movlps %xmm0, (%rdi) @@ -116,13 +108,10 @@ ; ; X64-SSE42-LABEL: convert_v3i8_to_v3f32: ; X64-SSE42: # %bb.0: # %entry -; X64-SSE42-NEXT: movzbl 2(%rsi), %eax -; X64-SSE42-NEXT: movzwl (%rsi), %ecx -; X64-SSE42-NEXT: movd %ecx, %xmm0 -; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0 -; X64-SSE42-NEXT: pslld $24, %xmm0 -; X64-SSE42-NEXT: psrad $24, %xmm0 +; X64-SSE42-NEXT: movzwl (%rsi), %eax +; X64-SSE42-NEXT: movd %eax, %xmm0 +; X64-SSE42-NEXT: pinsrb $2, 2(%rsi), %xmm0 +; X64-SSE42-NEXT: pmovsxbd %xmm0, %xmm0 ; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-SSE42-NEXT: extractps $2, %xmm0, 8(%rdi) ; X64-SSE42-NEXT: movlps %xmm0, (%rdi) Index: llvm/test/CodeGen/X86/widen_conv-4.ll =================================================================== --- llvm/test/CodeGen/X86/widen_conv-4.ll +++ llvm/test/CodeGen/X86/widen_conv-4.ll @@ -28,15 +28,15 @@ ; X86-SSE42-LABEL: convert_v7i16_v7f32: ; X86-SSE42: # %bb.0: # %entry ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE42-NEXT: pxor %xmm1, %xmm1 -; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1 +; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 -; X86-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1 -; X86-SSE42-NEXT: extractps $2, %xmm0, 24(%eax) -; X86-SSE42-NEXT: extractps $1, %xmm0, 20(%eax) -; X86-SSE42-NEXT: movups %xmm1, (%eax) -; X86-SSE42-NEXT: movss %xmm0, 16(%eax) +; X86-SSE42-NEXT: movups %xmm0, (%eax) +; X86-SSE42-NEXT: extractps $2, %xmm1, 24(%eax) +; X86-SSE42-NEXT: extractps $1, %xmm1, 20(%eax) +; X86-SSE42-NEXT: movss %xmm1, 16(%eax) ; X86-SSE42-NEXT: retl ; ; X64-SSE2-LABEL: convert_v7i16_v7f32: @@ -55,14 +55,14 @@ ; ; X64-SSE42-LABEL: convert_v7i16_v7f32: ; X64-SSE42: # %bb.0: # %entry -; X64-SSE42-NEXT: pxor %xmm1, %xmm1 -; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1 +; 
X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 -; X64-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1 -; X64-SSE42-NEXT: extractps $2, %xmm0, 24(%rdi) -; X64-SSE42-NEXT: movlps %xmm0, 16(%rdi) -; X64-SSE42-NEXT: movups %xmm1, (%rdi) +; X64-SSE42-NEXT: movups %xmm0, (%rdi) +; X64-SSE42-NEXT: extractps $2, %xmm1, 24(%rdi) +; X64-SSE42-NEXT: movlps %xmm1, 16(%rdi) ; X64-SSE42-NEXT: retq entry: %val = uitofp <7 x i16> %src to <7 x float> @@ -75,66 +75,58 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind { ; X86-SSE2-LABEL: convert_v3i8_to_v3f32: ; X86-SSE2: # %bb.0: # %entry -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: andl $-16, %esp -; X86-SSE2-NEXT: subl $32, %esp -; X86-SSE2-NEXT: movl 8(%ebp), %eax -; X86-SSE2-NEXT: movl 12(%ebp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movzwl (%ecx), %edx ; X86-SSE2-NEXT: movd %edx, %xmm0 -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X86-SSE2-NEXT: pand %xmm1, %xmm0 ; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx -; X86-SSE2-NEXT: movdqa %xmm0, (%esp) -; X86-SSE2-NEXT: movzbl (%esp), %edx -; X86-SSE2-NEXT: movd %edx, %xmm0 -; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: pinsrw $2, %edx, %xmm0 -; X86-SSE2-NEXT: pinsrw $4, %ecx, %xmm0 -; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; X86-SSE2-NEXT: movd %ecx, %xmm2 +; X86-SSE2-NEXT: pslld $16, %xmm2 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-SSE2-NEXT: cvtdq2ps %xmm1, %xmm0 ; X86-SSE2-NEXT: movss %xmm0, (%eax) ; X86-SSE2-NEXT: movaps %xmm0, %xmm1 ; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; X86-SSE2-NEXT: movss %xmm1, 8(%eax) ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: movss %xmm0, 4(%eax) -; X86-SSE2-NEXT: movl %ebp, %esp -; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: convert_v3i8_to_v3f32: ; X86-SSE42: # %bb.0: # %entry -; X86-SSE42-NEXT: pushl %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE42-NEXT: movzbl 2(%ecx), %edx -; X86-SSE42-NEXT: movzwl (%ecx), %ecx -; X86-SSE42-NEXT: movd %ecx, %xmm0 +; X86-SSE42-NEXT: movzwl (%ecx), %edx +; X86-SSE42-NEXT: movd %edx, %xmm0 +; X86-SSE42-NEXT: pinsrb $2, 2(%ecx), %xmm0 ; X86-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-SSE42-NEXT: pinsrd $2, %edx, %xmm0 -; X86-SSE42-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax) ; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax) ; X86-SSE42-NEXT: movss %xmm0, (%eax) -; X86-SSE42-NEXT: popl %eax ; X86-SSE42-NEXT: retl ; ; X64-SSE2-LABEL: convert_v3i8_to_v3f32: ; X64-SSE2: # %bb.0: # %entry ; X64-SSE2-NEXT: movzwl (%rsi), %eax ; X64-SSE2-NEXT: movd %eax, %xmm0 -; X64-SSE2-NEXT: 
punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X64-SSE2-NEXT: pand %xmm1, %xmm0 ; X64-SSE2-NEXT: movzbl 2(%rsi), %eax -; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-SSE2-NEXT: movd %ecx, %xmm0 -; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-SSE2-NEXT: pinsrw $2, %ecx, %xmm0 -; X64-SSE2-NEXT: pinsrw $4, %eax, %xmm0 -; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; X64-SSE2-NEXT: movd %eax, %xmm2 +; X64-SSE2-NEXT: pslld $16, %xmm2 +; X64-SSE2-NEXT: pandn %xmm2, %xmm1 +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SSE2-NEXT: cvtdq2ps %xmm1, %xmm0 ; X64-SSE2-NEXT: movlps %xmm0, (%rdi) ; X64-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; X64-SSE2-NEXT: movss %xmm0, 8(%rdi) @@ -142,12 +134,10 @@ ; ; X64-SSE42-LABEL: convert_v3i8_to_v3f32: ; X64-SSE42: # %bb.0: # %entry -; X64-SSE42-NEXT: movzbl 2(%rsi), %eax -; X64-SSE42-NEXT: movzwl (%rsi), %ecx -; X64-SSE42-NEXT: movd %ecx, %xmm0 +; X64-SSE42-NEXT: movzwl (%rsi), %eax +; X64-SSE42-NEXT: movd %eax, %xmm0 +; X64-SSE42-NEXT: pinsrb $2, 2(%rsi), %xmm0 ; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0 -; X64-SSE42-NEXT: pand {{.*}}(%rip), %xmm0 ; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-SSE42-NEXT: extractps $2, %xmm0, 8(%rdi) ; X64-SSE42-NEXT: movlps %xmm0, (%rdi) Index: llvm/test/CodeGen/X86/widen_load-2.ll =================================================================== --- llvm/test/CodeGen/X86/widen_load-2.ll +++ llvm/test/CodeGen/X86/widen_load-2.ll @@ -15,7 +15,8 @@ ; X86-NEXT: movdqa (%edx), %xmm0 ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: pextrd $2, %xmm0, 8(%eax) -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: pextrd $1, %xmm0, 4(%eax) +; X86-NEXT: movd %xmm0, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i32: @@ -39,13 +40,16 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pinsrd $1, 4(%edx), %xmm0 ; X86-NEXT: pinsrd $2, 8(%edx), %xmm0 -; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1 ; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: movq %xmm1, (%eax) +; X86-NEXT: pextrd $1, %xmm1, 4(%eax) ; X86-NEXT: pextrd $2, %xmm1, 8(%eax) +; X86-NEXT: movd %xmm1, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i32_2: @@ -77,8 +81,9 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: paddd 16(%ecx), %xmm1 +; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: movq %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -143,31 +148,25 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind { ; X86-LABEL: 
add3i16: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl 16(%ebp), %ecx -; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: pextrw $4, %xmm1, 4(%eax) -; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pinsrw $2, 4(%edx), %xmm0 +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: pinsrw $2, 4(%ecx), %xmm1 +; X86-NEXT: paddw %xmm0, %xmm1 +; X86-NEXT: pextrw $2, %xmm1, 4(%eax) ; X86-NEXT: movd %xmm1, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i16: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-NEXT: paddd %xmm0, %xmm1 -; X64-NEXT: pextrw $4, %xmm1, 4(%rdi) -; X64-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: paddw %xmm0, %xmm1 +; X64-NEXT: pextrw $2, %xmm1, 4(%rdi) ; X64-NEXT: movd %xmm1, (%rdi) ; X64-NEXT: retq %a = load %i16vec3, %i16vec3* %ap, align 16 @@ -216,7 +215,8 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddw (%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm1 -; X86-NEXT: movq %xmm1, 16(%eax) +; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -280,27 +280,23 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind { ; X86-LABEL: add3i8: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: pextrb $8, %xmm1, 2(%eax) -; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: paddb %xmm0, %xmm1 +; X86-NEXT: pextrb $2, %xmm1, 2(%eax) ; X86-NEXT: pextrw $0, %xmm1, (%eax) -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i8: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-NEXT: paddd %xmm0, %xmm1 -; X64-NEXT: pextrb $8, %xmm1, 2(%rdi) -; X64-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: paddb %xmm0, %xmm1 +; 
X64-NEXT: pextrb $2, %xmm1, 2(%rdi) ; X64-NEXT: pextrw $0, %xmm1, (%rdi) ; X64-NEXT: retq %a = load %i8vec3, %i8vec3* %ap, align 16 @@ -321,10 +317,11 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddb (%ecx), %xmm0 ; X86-NEXT: paddb 16(%ecx), %xmm1 +; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: pextrd $2, %xmm1, 24(%eax) ; X86-NEXT: pextrw $6, %xmm1, 28(%eax) ; X86-NEXT: pextrb $14, %xmm1, 30(%eax) -; X86-NEXT: movq %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -353,7 +350,6 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind { ; X86-LABEL: rot: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $16, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -361,12 +357,11 @@ ; X86-NEXT: movw $-24930, (%edx) # imm = 0x9E9E ; X86-NEXT: movb $1, 2(%ecx) ; X86-NEXT: movw $257, (%ecx) # imm = 0x101 -; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-NEXT: psrld $1, %xmm0 -; X86-NEXT: pextrb $8, %xmm0, 2(%eax) -; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: psrlw $1, %xmm0 +; X86-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X86-NEXT: pextrb $2, %xmm0, 2(%eax) ; X86-NEXT: pextrw $0, %xmm0, (%eax) -; X86-NEXT: addl $16, %esp ; X86-NEXT: retl $4 ; ; X64-LABEL: rot: @@ -376,10 +371,10 @@ ; X64-NEXT: movw $-24930, (%rsi) # imm = 0x9E9E ; X64-NEXT: movb $1, 2(%rdx) ; X64-NEXT: movw $257, (%rdx) # imm = 0x101 -; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-NEXT: psrld $1, %xmm0 -; X64-NEXT: pextrb $8, %xmm0, 2(%rdi) -; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: psrlw $1, %xmm0 +; X64-NEXT: pand {{.*}}(%rip), %xmm0 +; X64-NEXT: pextrb $2, %xmm0, 2(%rdi) ; X64-NEXT: pextrw $0, %xmm0, (%rdi) ; X64-NEXT: retq entry: Index: llvm/test/CodeGen/X86/widen_shuffle-1.ll =================================================================== --- llvm/test/CodeGen/X86/widen_shuffle-1.ll +++ llvm/test/CodeGen/X86/widen_shuffle-1.ll @@ -89,18 +89,12 @@ define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone { ; X86-LABEL: shuf4: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X86-NEXT: pshufb %xmm2, %xmm1 -; X86-NEXT: pshufb %xmm2, %xmm0 -; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl ; ; X64-LABEL: shuf4: ; X64: # %bb.0: -; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X64-NEXT: pshufb %xmm2, %xmm1 -; X64-NEXT: pshufb %xmm2, %xmm0 -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: retq %vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> ret <8 x i8> %vshuf Index: llvm/test/CodeGen/X86/x86-interleaved-access.ll =================================================================== --- llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -389,29 +389,30 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) { ; AVX-LABEL: interleaved_load_vf8_i8_stride4: ; AVX: 
# %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX-NEXT: vmovdqa (%rdi), %xmm1 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm0[0],xmm3[0] -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7] -; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX-NEXT: vpaddw %xmm0, %xmm4, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,3,2,4,5,6,7] -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX-NEXT: retq %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16 %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> @@ -888,13 +889,8 @@ define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) { ; AVX-LABEL: interleaved_store_vf8_i8_stride4: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1 -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; 
AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vmovdqa %xmm0, 16(%rdi) @@ -1017,17 +1013,17 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX-NEXT: retq %wide.vec = load <24 x i8>, <24 x i8>* %ptr %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> @@ -1041,19 +1037,15 @@ define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) { ; AVX-LABEL: interleaved_store_vf8_i8_stride3: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero -; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero +; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) -; AVX-NEXT: vmovdqu %xmm2, (%rdi) +; AVX-NEXT: vmovdqu %xmm1, (%rdi) ; AVX-NEXT: retq %1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> %2 = shufflevector <8 x i8> %c, <8 x i8> undef, <16 x i32> Index: llvm/test/CodeGen/X86/x86-shifts.ll 
=================================================================== --- llvm/test/CodeGen/X86/x86-shifts.ll +++ llvm/test/CodeGen/X86/x86-shifts.ll @@ -254,16 +254,16 @@ ; X32-LABEL: shl2_other: ; X32: # %bb.0: # %entry ; X32-NEXT: movdqa %xmm0, %xmm1 -; X32-NEXT: psllq $2, %xmm1 -; X32-NEXT: psllq $9, %xmm0 +; X32-NEXT: pslld $2, %xmm1 +; X32-NEXT: pslld $9, %xmm0 ; X32-NEXT: pxor %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shl2_other: ; X64: # %bb.0: # %entry ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psllq $2, %xmm1 -; X64-NEXT: psllq $9, %xmm0 +; X64-NEXT: pslld $2, %xmm1 +; X64-NEXT: pslld $9, %xmm0 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: retq entry: @@ -276,19 +276,17 @@ define <2 x i32> @shr2_other(<2 x i32> %A) nounwind { ; X32-LABEL: shr2_other: ; X32: # %bb.0: # %entry -; X32-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-NEXT: movdqa %xmm0, %xmm1 -; X32-NEXT: psrlq $8, %xmm1 -; X32-NEXT: psrlq $1, %xmm0 +; X32-NEXT: psrld $8, %xmm1 +; X32-NEXT: psrld $1, %xmm0 ; X32-NEXT: pxor %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shr2_other: ; X64: # %bb.0: # %entry -; X64-NEXT: pand {{.*}}(%rip), %xmm0 ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psrlq $8, %xmm1 -; X64-NEXT: psrlq $1, %xmm0 +; X64-NEXT: psrld $8, %xmm1 +; X64-NEXT: psrld $1, %xmm0 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: retq entry: Index: llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll +++ llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll @@ -22,9 +22,19 @@ define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: @h( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]] -; CHECK-NEXT: ret <4 x i8> [[TMP2]] +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 +; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 +; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] +; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] +; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x i8> undef, i8 [[X0X0]], i32 0 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> [[INS1]], i8 [[X3X3]], i32 1 +; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2 +; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3 +; CHECK-NEXT: ret <4 x i8> [[INS4]] ; %x0 = extractelement <4 x i8> %x, i32 0 %x3 = extractelement <4 x i8> %x, i32 3 @@ -43,9 +53,16 @@ define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: @h_undef( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]] -; CHECK-NEXT: ret <4 x i8> [[TMP2]] +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3 +; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 +; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] +; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> undef, i8 [[X3X3]], i32 1 +; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x 
i8> [[INS2]], i8 [[Y1Y1]], i32 2 +; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3 +; CHECK-NEXT: ret <4 x i8> [[INS4]] ; %x0 = extractelement <4 x i8> undef, i32 0 %x3 = extractelement <4 x i8> %x, i32 3 @@ -64,13 +81,17 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: @i( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i8> [[TMP2]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i8> [[BIN_RDX]], <4 x i8> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i8> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 +; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 +; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] +; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]] +; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret i8 [[TMP3]] ; %x0 = extractelement <4 x i8> %x, i32 0 Index: llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -160,38 +160,11 @@ } define void @fptosi_8f64_8i16() #0 { -; SSE-LABEL: @fptosi_8f64_8i16( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i16 -; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i16 -; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i16 -; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i16 -; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i16 -; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i16 -; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i16 -; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i16 -; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 
[[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 -; SSE-NEXT: ret void -; -; AVX-LABEL: @fptosi_8f64_8i16( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i16> -; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 -; AVX-NEXT: ret void +; CHECK-LABEL: @fptosi_8f64_8i16( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i16> +; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 +; CHECK-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 Index: llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -238,44 +238,11 @@ ; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptoui_8f64_8i16( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16 -; AVX256NODQ-NEXT: [[CVT4:%.*]] 
= fptoui double [[A4]] to i16 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16 -; AVX256NODQ-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 -; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @fptoui_8f64_8i16( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16> -; AVX512-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @fptoui_8f64_8i16( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16> -; AVX256DQ-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 -; AVX256DQ-NEXT: ret void +; AVX-LABEL: @fptoui_8f64_8i16( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16> +; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 +; AVX-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 Index: llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -344,16 +344,22 @@ ; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 ; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 ; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0 -; ZEROTHRESH-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0 +; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer ; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] ; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], 
float [[B1]] -; ZEROTHRESH-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]] -; ZEROTHRESH-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]] +; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; ZEROTHRESH-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B3]], i32 1 +; ZEROTHRESH-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] ; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0 ; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 -; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[S2]], i32 2 -; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 +; ZEROTHRESH-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 2 +; ZEROTHRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP10]], i32 3 ; ZEROTHRESH-NEXT: ret <4 x float> [[RD]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -430,18 +436,12 @@ ; CHECK-NEXT: ret <2 x float> [[RB]] ; ; ZEROTHRESH-LABEL: @simple_select_v2( -; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> [[C]], i32 1 -; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> [[A]], i32 1 -; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> [[B:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> [[B]], i32 1 -; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 -; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] -; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] -; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[S0]], i32 0 -; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[S1]], i32 1 +; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer +; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]] +; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0 +; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1 ; ZEROTHRESH-NEXT: ret <2 x float> [[RB]] ; %c0 = extractelement <2 x i32> %c, i32 0 Index: llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -916,11 +916,26 @@ } define void @sitofp_4i16_4f32() #0 { -; CHECK-LABEL: @sitofp_4i16_4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x 
float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; CHECK-NEXT: ret void +; SSE-LABEL: @sitofp_4i16_4f32( +; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 +; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float +; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @sitofp_4i16_4f32( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> +; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; AVX-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 @@ -939,12 +954,30 @@ define void @sitofp_8i16_8f32() #0 { ; SSE-LABEL: @sitofp_8i16_8f32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 +; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 +; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 +; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 +; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 +; SSE-NEXT: 
[[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float +; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float +; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float +; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float +; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float +; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 +; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 +; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 +; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_8i16_8f32( @@ -982,18 +1015,54 @@ define void @sitofp_16i16_16f32() #0 { ; SSE-LABEL: @sitofp_16i16_16f32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 -; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 +; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 +; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 
0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 +; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 +; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 +; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 +; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 +; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16 +; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2 +; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4 +; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2 +; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8 +; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2 +; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4 +; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float +; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float +; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float +; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float +; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float +; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[LD8]] to float +; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[LD9]] to float +; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[LD10]] to float +; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[LD11]] to float +; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[LD12]] to float +; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[LD13]] to float +; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[LD14]] to float +; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[LD15]] to float +; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 +; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 +; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 +; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: store float [[CVT8]], 
float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32 +; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 +; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8 +; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 +; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16 +; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 +; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8 +; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i16_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll +++ llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -868,11 +868,26 @@ } define void @uitofp_4i16_4f32() #0 { -; CHECK-LABEL: @uitofp_4i16_4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> -; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; CHECK-NEXT: ret void +; SSE-LABEL: @uitofp_4i16_4f32( +; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 +; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 +; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float +; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float +; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float +; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float +; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @uitofp_4i16_4f32( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> +; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; AVX-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 @@ -891,12 +906,30 @@ define void @uitofp_8i16_8f32() 
#0 { ; SSE-LABEL: @uitofp_8i16_8f32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 +; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 +; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 +; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 +; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 +; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 +; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float +; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float +; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float +; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float +; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float +; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float +; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float +; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float +; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 +; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 +; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 +; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @uitofp_8i16_8f32( @@ -934,18 +967,54 @@ define void @uitofp_16i16_16f32() #0 { ; SSE-LABEL: @uitofp_16i16_16f32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x 
i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i16> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 -; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 +; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 +; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 +; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 +; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 +; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 +; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 +; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16 +; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2 +; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4 +; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2 +; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8 +; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2 +; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4 +; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2 +; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float +; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float +; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float +; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float +; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float +; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float +; SSE-NEXT: 
[[CVT6:%.*]] = uitofp i16 [[LD6]] to float +; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float +; SSE-NEXT: [[CVT8:%.*]] = uitofp i16 [[LD8]] to float +; SSE-NEXT: [[CVT9:%.*]] = uitofp i16 [[LD9]] to float +; SSE-NEXT: [[CVT10:%.*]] = uitofp i16 [[LD10]] to float +; SSE-NEXT: [[CVT11:%.*]] = uitofp i16 [[LD11]] to float +; SSE-NEXT: [[CVT12:%.*]] = uitofp i16 [[LD12]] to float +; SSE-NEXT: [[CVT13:%.*]] = uitofp i16 [[LD13]] to float +; SSE-NEXT: [[CVT14:%.*]] = uitofp i16 [[LD14]] to float +; SSE-NEXT: [[CVT15:%.*]] = uitofp i16 [[LD15]] to float +; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 +; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 +; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 +; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32 +; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 +; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8 +; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 +; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16 +; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 +; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8 +; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_16i16_16f32(