Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2058,7 +2058,10 @@ VL0->getType(), SrcTy, VL0); VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); + int VecCost = 0; + // Check if the values are candidates to demote. + if (!MinBWs.count(VL0) || VecTy != SrcVecTy) + VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); return VecCost - ScalarCost; } case Instruction::FCmp: @@ -3899,10 +3902,12 @@ // in ToDemote and additional roots that require investigating in Roots. static bool collectValuesToDemote(Value *V, SmallPtrSetImpl &Expr, SmallVectorImpl &ToDemote, - SmallVectorImpl &Roots) { + SmallVectorImpl &Roots, + bool IncludeCastsToDemote) { // We can always demote constants. if (isa(V)) { - ToDemote.push_back(V); + if (IncludeCastsToDemote) + ToDemote.push_back(V); return true; } @@ -3918,9 +3923,11 @@ // seed additional demotion, we save the truncated value. case Instruction::Trunc: Roots.push_back(I->getOperand(0)); - break; + LLVM_FALLTHROUGH; case Instruction::ZExt: case Instruction::SExt: + if (!IncludeCastsToDemote) + return true; break; // We can demote certain binary operations if we can demote both of their @@ -3931,16 +3938,20 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) || - !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots)) + if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots, + /*IncludeCastsToDemote=*/true) || + !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots, + /*IncludeCastsToDemote=*/true)) return false; break; // We can demote selects if we can demote their true and false values. 
case Instruction::Select: { SelectInst *SI = cast(I); - if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) || - !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots)) + if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots, + /*IncludeCastsToDemote=*/true) || + !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots, + /*IncludeCastsToDemote=*/true)) return false; break; } @@ -3950,7 +3961,8 @@ case Instruction::PHI: { PHINode *PN = cast(I); for (Value *IncValue : PN->incoming_values()) - if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots)) + if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots, + /*IncludeCastsToDemote=*/true)) return false; break; } @@ -4007,9 +4019,14 @@ // additional roots that require investigating in Roots. SmallVector ToDemote; SmallVector Roots; - for (auto *Root : TreeRoot) - if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) + for (auto *Root : TreeRoot) { + // Do not include the top-level zext/sext/trunc operations among the values to be demoted: + // doing so produces a noisy vector cast, vector trunc, scalar extract, scalar cast + // sequence. + if (!collectValuesToDemote(Root, Expr, ToDemote, Roots, + /*IncludeCastsToDemote=*/false)) return; + } // The maximum bit width required to represent all the values that can be // demoted without loss of precision. It would be safe to truncate the roots @@ -4087,8 +4104,10 @@ // If we can truncate the root, we must collect additional values that might // be demoted as a result. That is, those seeded by truncations we will // modify. - while (!Roots.empty()) - collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots); + while (!Roots.empty()) { + collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots, + /*IncludeCastsToDemote=*/true); + } // Finally, map the values we can demote to the maximum bit with we computed. 
for (auto *Scalar : ToDemote) Index: test/Transforms/SLPVectorizer/X86/PR35777.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR35777.ll +++ test/Transforms/SLPVectorizer/X86/PR35777.ll @@ -16,13 +16,10 @@ ; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = trunc <2 x i64> [[TMP9]] to <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP14]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP11]], 1 ; CHECK-NEXT: ret { i64, i64 } [[TMP17]] ; bb: Index: test/Transforms/SLPVectorizer/X86/sign-extend.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/sign-extend.ll +++ test/Transforms/SLPVectorizer/X86/sign-extend.ll @@ -4,18 +4,15 @@ define <4 x i32> @sign_extend_v_v(<4 x i16> %lhs) { ; CHECK-LABEL: @sign_extend_v_v( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x i16> [[LHS:%.*]], i32 0 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[VECEXT]] to i32 -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[CONV]], i32 0 -; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i16> [[LHS]], i32 1 -; CHECK-NEXT: [[CONV2:%.*]] = sext 
i16 [[VECEXT1]] to i32 -; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[CONV2]], i32 1 -; CHECK-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i16> [[LHS]], i32 2 -; CHECK-NEXT: [[CONV5:%.*]] = sext i16 [[VECEXT4]] to i32 -; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[CONV5]], i32 2 -; CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i16> [[LHS]], i32 3 -; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[VECEXT7]] to i32 -; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[CONV8]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[LHS:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 +; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 +; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[TMP4]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[VECINIT9]] ; entry: