Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4826,13 +4826,13 @@ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size() << ".\n"); - // Check that all of the parts are scalar instructions of the same type. - Instruction *I0 = dyn_cast<Instruction>(VL[0]); - if (!I0) + // Check that all of the parts are scalar instructions of the same type, + // we permit an alternate opcode via InstructionsState. + InstructionsState S = getSameOpcode(VL); + if (!S.Opcode) return false; - unsigned Opcode0 = I0->getOpcode(); - + Instruction *I0 = cast<Instruction>(S.OpValue); unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); @@ -4849,30 +4849,15 @@ for (Value *V : VL) { Type *Ty = V->getType(); if (!isValidElementType(Ty)) { - // NOTE: the following will give user internal llvm type name, which may not be useful - R.getORE()->emit([&]() { - std::string type_str; - llvm::raw_string_ostream rso(type_str); - Ty->print(rso); - return OptimizationRemarkMissed( - SV_NAME, "UnsupportedType", I0) - << "Cannot SLP vectorize list: type " - << rso.str() + " is unsupported by vectorizer"; - }); - return false; - } - Instruction *Inst = dyn_cast<Instruction>(V); - - if (!Inst) - return false; - if (Inst->getOpcode() != Opcode0) { + // NOTE: the following will give user internal llvm type name, which may + // not be useful. 
R.getORE()->emit([&]() { - return OptimizationRemarkMissed( - SV_NAME, "InequableTypes", I0) - << "Cannot SLP vectorize list: not all of the " - << "parts of scalar instructions are of the same type: " - << ore::NV("Instruction1Opcode", I0) << " and " - << ore::NV("Instruction2Opcode", Inst); + std::string type_str; + llvm::raw_string_ostream rso(type_str); + Ty->print(rso); + return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0) + << "Cannot SLP vectorize list: type " + << rso.str() + " is unsupported by vectorizer"; }); return false; } Index: test/Transforms/SLPVectorizer/X86/alternate-fp.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -8,38 +8,9 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @fadd_fsub_v8f32( -; CHECK-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; CHECK-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; CHECK-NEXT: [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1 -; CHECK-NEXT: [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2 -; CHECK-NEXT: [[B3:%.*]] = extractelement <8 x float> [[B]], i32 3 -; CHECK-NEXT: [[B4:%.*]] = extractelement <8 x float> [[B]], i32 4 -; CHECK-NEXT: [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5 -; CHECK-NEXT: [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6 -; CHECK-NEXT: [[B7:%.*]] = extractelement <8 x float> [[B]], 
i32 7 -; CHECK-NEXT: [[AB0:%.*]] = fadd float [[A0]], [[B0]] -; CHECK-NEXT: [[AB1:%.*]] = fsub float [[A1]], [[B1]] -; CHECK-NEXT: [[AB2:%.*]] = fsub float [[A2]], [[B2]] -; CHECK-NEXT: [[AB3:%.*]] = fadd float [[A3]], [[B3]] -; CHECK-NEXT: [[AB4:%.*]] = fadd float [[A4]], [[B4]] -; CHECK-NEXT: [[AB5:%.*]] = fsub float [[A5]], [[B5]] -; CHECK-NEXT: [[AB6:%.*]] = fsub float [[A6]], [[B6]] -; CHECK-NEXT: [[AB7:%.*]] = fadd float [[A7]], [[B7]] -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP1]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[R7]] ; %a0 = extractelement <8 x float> %a, i32 0 @@ -78,40 +49,40 @@ } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fmul_fdiv_v8f32( -; CHECK-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; CHECK-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 
x float> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; CHECK-NEXT: [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1 -; CHECK-NEXT: [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2 -; CHECK-NEXT: [[B3:%.*]] = extractelement <8 x float> [[B]], i32 3 -; CHECK-NEXT: [[B4:%.*]] = extractelement <8 x float> [[B]], i32 4 -; CHECK-NEXT: [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5 -; CHECK-NEXT: [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6 -; CHECK-NEXT: [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7 -; CHECK-NEXT: [[AB0:%.*]] = fmul float [[A0]], [[B0]] -; CHECK-NEXT: [[AB1:%.*]] = fdiv float [[A1]], [[B1]] -; CHECK-NEXT: [[AB2:%.*]] = fdiv float [[A2]], [[B2]] -; CHECK-NEXT: [[AB3:%.*]] = fmul float [[A3]], [[B3]] -; CHECK-NEXT: [[AB4:%.*]] = fmul float [[A4]], [[B4]] -; CHECK-NEXT: [[AB5:%.*]] = fdiv float [[A5]], [[B5]] -; CHECK-NEXT: [[AB6:%.*]] = fdiv float [[A6]], [[B6]] -; CHECK-NEXT: [[AB7:%.*]] = fmul float [[A7]], [[B7]] -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; SSE-LABEL: @fmul_fdiv_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP2]], <8 
x float> [[TMP1]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[R7]] +; +; SLM-LABEL: @fmul_fdiv_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]] +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <8 x i32> +; SLM-NEXT: [[TMP9:%.*]] = fdiv <4 x float> [[TMP5]], [[TMP6]] +; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> undef, <8 x i32> +; SLM-NEXT: [[R3:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP3]], <8 x i32> +; SLM-NEXT: [[R4:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; SLM-NEXT: [[R6:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP10]], <8 x i32> +; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[R6]], <8 x float> [[TMP8]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[R7]] +; +; AVX-LABEL: @fmul_fdiv_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP1]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[R7]] +; +; AVX512-LABEL: @fmul_fdiv_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP1]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[R7]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 
@@ -149,60 +120,9 @@ } define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { -; SSE-LABEL: @fmul_fdiv_v4f32_const( -; SSE-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; SSE-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; SSE-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; SSE-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP4]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 -; SSE-NEXT: ret <4 x float> [[R3]] -; -; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00 -; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 -; SLM-NEXT: ret <4 x float> [[R3]] -; -; AVX-LABEL: @fmul_fdiv_v4f32_const( -; AVX-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> 
undef, <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; AVX-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; AVX-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; AVX-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP4]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 -; AVX-NEXT: ret <4 x float> [[R3]] -; -; AVX512-LABEL: @fmul_fdiv_v4f32_const( -; AVX512-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2 -; AVX512-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; AVX512-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; AVX512-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP4]], i32 1 -; AVX512-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2 -; AVX512-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 -; AVX512-NEXT: ret <4 x float> [[R3]] +; CHECK-LABEL: @fmul_fdiv_v4f32_const( +; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 Index: test/Transforms/SLPVectorizer/X86/alternate-int.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ 
test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -8,31 +8,9 @@ define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: @add_sub_v8i32( -; CHECK-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 4 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 4 -; CHECK-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 -; CHECK-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 -; CHECK-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A]], [[B]] -; CHECK-NEXT: [[AB4:%.*]] = sub i32 [[A4]], [[B4]] -; CHECK-NEXT: [[AB5:%.*]] = sub i32 [[A5]], [[B5]] -; CHECK-NEXT: [[AB6:%.*]] = sub i32 [[A6]], [[B6]] -; CHECK-NEXT: [[AB7:%.*]] = sub i32 [[A7]], [[B7]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], 
[[B]] +; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -71,72 +49,11 @@ } define <4 x i32> @add_and_v4i32(<4 x i32> %a, <4 x i32> %b) { -; SSE-LABEL: @add_and_v4i32( -; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 -; SSE-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 2 -; SSE-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3 -; SSE-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[B]] -; SSE-NEXT: [[AB2:%.*]] = and i32 [[A2]], [[B2]] -; SSE-NEXT: [[AB3:%.*]] = and i32 [[A3]], [[B3]] -; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; SSE-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 -; SSE-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[TMP3]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3 -; SSE-NEXT: ret <4 x i32> [[R3]] -; -; SLM-LABEL: @add_and_v4i32( -; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 -; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i32> [[B]], i32 1 -; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B]], i32 2 -; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3 -; SLM-NEXT: [[AB0:%.*]] = add i32 [[A0]], [[B0]] -; SLM-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B1]] -; SLM-NEXT: [[AB2:%.*]] = and i32 [[A2]], [[B2]] -; SLM-NEXT: [[AB3:%.*]] = and i32 [[A3]], [[B3]] -; SLM-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, 
i32 [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[R3]] -; -; AVX-LABEL: @add_and_v4i32( -; AVX-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 -; AVX-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 2 -; AVX-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3 -; AVX-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[B]] -; AVX-NEXT: [[AB2:%.*]] = and i32 [[A2]], [[B2]] -; AVX-NEXT: [[AB3:%.*]] = and i32 [[A3]], [[B3]] -; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; AVX-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 -; AVX-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[TMP3]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX-NEXT: ret <4 x i32> [[R3]] -; -; AVX512-LABEL: @add_and_v4i32( -; AVX512-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 2 -; AVX512-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 -; AVX512-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 2 -; AVX512-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3 -; AVX512-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[B]] -; AVX512-NEXT: [[AB2:%.*]] = and i32 [[A2]], [[B2]] -; AVX512-NEXT: [[AB3:%.*]] = and i32 [[A3]], [[B3]] -; AVX512-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0 -; AVX512-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[TMP3]], i32 1 
-; AVX512-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX512-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX512-NEXT: ret <4 x i32> [[R3]] +; CHECK-LABEL: @add_and_v4i32( +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]] +; CHECK-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[R3]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -158,24 +75,42 @@ } define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: @add_mul_v4i32( -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 -; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x i32> [[B]], i32 1 -; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B]], i32 2 -; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3 -; CHECK-NEXT: [[AB0:%.*]] = mul i32 [[A0]], [[B0]] -; CHECK-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B1]] -; CHECK-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B2]] -; CHECK-NEXT: [[AB3:%.*]] = mul i32 [[A3]], [[B3]] -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[AB0]], i32 0 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[R3]] +; SSE-LABEL: @add_mul_v4i32( +; SSE-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]] +; SSE-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> 
[[TMP1]], <4 x i32> +; SSE-NEXT: ret <4 x i32> [[R3]] +; +; SLM-LABEL: @add_mul_v4i32( +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i32> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3 +; SLM-NEXT: [[AB0:%.*]] = mul i32 [[A0]], [[B0]] +; SLM-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B1]] +; SLM-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B2]] +; SLM-NEXT: [[AB3:%.*]] = mul i32 [[A3]], [[B3]] +; SLM-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: ret <4 x i32> [[R3]] +; +; AVX-LABEL: @add_mul_v4i32( +; AVX-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]] +; AVX-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> +; AVX-NEXT: ret <4 x i32> [[R3]] +; +; AVX512-LABEL: @add_mul_v4i32( +; AVX512-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]] +; AVX512-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> +; AVX512-NEXT: ret <4 x i32> [[R3]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -202,160 +137,38 @@ ; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 ; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 ; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> 
[[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 ; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 ; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 -; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 -; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 -; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] ; SSE-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] ; SSE-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; SSE-NEXT: [[AB4:%.*]] = shl i32 [[A4]], [[B4]] -; SSE-NEXT: [[AB5:%.*]] = shl i32 [[A5]], [[B5]] -; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] +; SSE-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 ; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 ; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_shl_v8i32( 
-; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; SLM-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; SLM-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 -; SLM-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 -; SLM-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 -; SLM-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 -; SLM-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; SLM-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] -; SLM-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SLM-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] -; SLM-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; SLM-NEXT: [[AB4:%.*]] = shl i32 [[A4]], [[B4]] -; SLM-NEXT: [[AB5:%.*]] = shl i32 [[A5]], [[B5]] -; SLM-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; SLM-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; SLM-NEXT: [[R7:%.*]] = 
insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[R7]] ; -; AVX1-LABEL: @ashr_shl_v8i32( -; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 -; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 -; AVX1-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 -; AVX1-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 -; AVX1-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 -; AVX1-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] -; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] -; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; AVX1-NEXT: [[AB4:%.*]] = shl i32 [[A4]], [[B4]] -; AVX1-NEXT: [[AB5:%.*]] = shl i32 [[A5]], [[B5]] -; AVX1-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; AVX1-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> 
[[R1]], i32 [[AB2]], i32 2 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; AVX1-NEXT: ret <8 x i32> [[R7]] -; -; AVX2-LABEL: @ashr_shl_v8i32( -; AVX2-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 4 -; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; AVX2-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX2-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 4 -; AVX2-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 -; AVX2-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 -; AVX2-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] -; AVX2-NEXT: [[AB4:%.*]] = shl i32 [[A4]], [[B4]] -; AVX2-NEXT: [[AB5:%.*]] = shl i32 [[A5]], [[B5]] -; AVX2-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; AVX2-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX2-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 -; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 -; AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 -; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> 
[[R4]], i32 [[AB5]], i32 5 -; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; AVX2-NEXT: ret <8 x i32> [[R7]] +; AVX-LABEL: @ashr_shl_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @ashr_shl_v8i32( -; AVX512-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 4 -; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; AVX512-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX512-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 4 -; AVX512-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 -; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 -; AVX512-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] -; AVX512-NEXT: [[AB4:%.*]] = shl i32 [[A4]], [[B4]] -; AVX512-NEXT: [[AB5:%.*]] = shl i32 [[A5]], [[B5]] -; AVX512-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; AVX512-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX512-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 -; AVX512-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 
x i32> [[R3]], i32 [[AB4]], i32 4 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> ; AVX512-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -394,30 +207,41 @@ } define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { -; CHECK-LABEL: @ashr_shl_v8i32_const( -; CHECK-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 4 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; CHECK-NEXT: [[AB4:%.*]] = shl i32 [[A4]], 3 -; CHECK-NEXT: [[AB5:%.*]] = shl i32 [[A5]], 3 -; CHECK-NEXT: [[AB6:%.*]] = shl i32 [[A6]], 3 -; CHECK-NEXT: [[AB7:%.*]] = shl i32 [[A7]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; CHECK-NEXT: [[R5:%.*]] = 
insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE-LABEL: @ashr_shl_v8i32_const( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], +; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[R7]] +; +; SLM-LABEL: @ashr_shl_v8i32_const( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], +; SLM-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @ashr_shl_v8i32_const( +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], +; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @ashr_shl_v8i32_const( +; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], +; AVX2-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], +; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[R7]] +; +; AVX512-LABEL: @ashr_shl_v8i32_const( +; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], +; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x 
i32> [[A]], +; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -485,101 +309,111 @@ ; SLM-LABEL: @ashr_lshr_shl_v8i32( ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; SLM-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; SLM-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 -; SLM-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 -; SLM-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; SLM-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; SLM-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SLM-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SLM-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SLM-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] -; SLM-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] -; SLM-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] -; SLM-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] +; SLM-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] ; SLM-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SLM-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; 
SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 +; SLM-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 ; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SLM-NEXT: ret <8 x i32> [[R7]] ; -; AVX-LABEL: @ashr_lshr_shl_v8i32( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; AVX-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; AVX-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; AVX-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; AVX-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 -; AVX-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; AVX-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; AVX-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 -; AVX-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 -; AVX-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 -; AVX-NEXT: [[B6:%.*]] = extractelement <8 x i32> 
[[B]], i32 6 -; AVX-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] -; AVX-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] -; AVX-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] -; AVX-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] -; AVX-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] -; AVX-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; AVX-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; AVX-NEXT: ret <8 x i32> [[R7]] +; AVX1-LABEL: @ashr_lshr_shl_v8i32( +; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 +; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 +; AVX1-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 +; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] +; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] +; AVX1-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] +; AVX1-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] +; AVX1-NEXT: [[R0:%.*]] = 
insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; AVX1-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @ashr_lshr_shl_v8i32( +; AVX2-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6 +; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX2-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 +; AVX2-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] +; AVX2-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 +; AVX2-NEXT: 
[[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 +; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 +; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 +; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 +; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX2-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @ashr_lshr_shl_v8i32( -; AVX512-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 2 -; AVX512-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; AVX512-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; AVX512-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; AVX512-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6 ; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX512-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 2 -; AVX512-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 -; AVX512-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 -; AVX512-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 -; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 +; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX512-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] -; AVX512-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] -; AVX512-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] -; AVX512-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] -; 
AVX512-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX512-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX512-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX512-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 -; AVX512-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 +; AVX512-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 +; AVX512-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 +; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 +; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 +; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 +; 
AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 ; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX512-NEXT: ret <8 x i32> [[R7]] Index: test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll +++ test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll @@ -33,16 +33,6 @@ ; CHECK-NOT: add nsw <{{[0-9]+}} x i32> ; YAML: Pass: slp-vectorizer - ; YAML-NEXT: Name: InequableTypes - ; YAML-NEXT: Function: foo - ; YAML-NEXT: Args: - ; YAML-NEXT: - String: 'Cannot SLP vectorize list: not all of the ' - ; YAML-NEXT: - String: 'parts of scalar instructions are of the same type: ' - ; YAML-NEXT: - Instruction1Opcode: add - ; YAML-NEXT: - String: ' and ' - ; YAML-NEXT: - Instruction2Opcode: phi - - ; YAML: Pass: slp-vectorizer ; YAML-NEXT: Name: NotPossible ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: Index: test/Transforms/SLPVectorizer/X86/resched.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/resched.ll +++ test/Transforms/SLPVectorizer/X86/resched.ll @@ -12,56 +12,70 @@ ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0 -; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1 -; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 ; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2 -; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 
[[CONV31_I]], 3 ; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3 -; CHECK-NEXT: [[SHR_3_I_I:%.*]] = lshr i32 [[CONV31_I]], 4 ; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4 -; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5 -; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 -; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 -; CHECK-NEXT: [[SHR_7_I_I:%.*]] = lshr i32 [[CONV31_I]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 -; CHECK-NEXT: [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9 ; CHECK-NEXT: 
[[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 -; CHECK-NEXT: [[SHR_9_I_I:%.*]] = lshr i32 [[CONV31_I]], 10 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 -; CHECK-NEXT: [[SHR_10_I_I:%.*]] = lshr i32 [[CONV31_I]], 11 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 -; CHECK-NEXT: [[SHR_11_I_I:%.*]] = lshr i32 [[CONV31_I]], 12 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 ; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 ; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 ; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[SHR_I_I]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SHR_1_I_I]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[SHR_2_I_I]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement 
<16 x i32> [[TMP4]], i32 [[SHR_3_I_I]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[SHR_4_I_I]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[SHR_5_I_I]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SHR_6_I_I]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_7_I_I]], i32 8 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_8_I_I]], i32 9 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_9_I_I]], i32 10 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[SHR_10_I_I]], i32 11 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[SHR_11_I_I]], i32 12 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[SHR_12_I_I]], i32 13 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[SHR_13_I_I]], i32 14 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[SHR_14_I_I]], i32 15 -; CHECK-NEXT: [[TMP17:%.*]] = trunc <16 x i32> [[TMP16]] to <16 x i8> -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i8> , [[TMP17]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 +; CHECK-NEXT: 
[[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15 +; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8> +; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> , [[TMP43]] ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP18]], <16 x i8>* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 
x i8>* [[TMP45]], align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void