Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2264,33 +2264,30 @@
       TargetTransformInfo::OperandValueProperties Op1VP =
           TargetTransformInfo::OP_None;
       TargetTransformInfo::OperandValueProperties Op2VP =
-          TargetTransformInfo::OP_None;
+          TargetTransformInfo::OP_PowerOf2;
 
       // If all operands are exactly the same ConstantInt then set the
       // operand kind to OK_UniformConstantValue.
       // If instead not all operands are constants, then set the operand kind
       // to OK_AnyValue. If all operands are constants but not the same,
       // then set the operand kind to OK_NonUniformConstantValue.
-      ConstantInt *CInt = nullptr;
+      ConstantInt *CInt0 = nullptr;
       for (unsigned i = 0; i < VL.size(); ++i) {
         const Instruction *I = cast<Instruction>(VL[i]);
-        if (!isa<ConstantInt>(I->getOperand(1))) {
+        ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1));
+        if (!CInt) {
           Op2VK = TargetTransformInfo::OK_AnyValue;
+          Op2VP = TargetTransformInfo::OP_None;
           break;
         }
-        if (i == 0) {
-          CInt = cast<ConstantInt>(I->getOperand(1));
+        if (!CInt->getValue().isPowerOf2())
+          Op2VP = TargetTransformInfo::OP_None;
+        if (i == 0 || CInt0 == CInt) {
+          CInt0 = CInt;
           continue;
         }
-        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-            CInt != cast<ConstantInt>(I->getOperand(1)))
-          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-      }
-      // FIXME: Currently cost of model modification for division by power of
-      // 2 is handled for X86 and AArch64. Add support for other targets.
-      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-          CInt->getValue().isPowerOf2())
-        Op2VP = TargetTransformInfo::OP_PowerOf2;
+        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+      }
 
       SmallVector<Value *, 4> Operands(VL0->operand_values());
       if (NeedToShuffleReuses) {
Index: test/Transforms/SLPVectorizer/X86/powof2div.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/powof2div.ll
+++ test/Transforms/SLPVectorizer/X86/powof2div.ll
@@ -58,38 +58,59 @@
 }
 
 define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
-; CHECK-LABEL: @powof2div_nonuniform(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[ADD]], 2
-; CHECK-NEXT:    store i32 [[DIV]], i32* [[A:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT:    store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
-; CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
-; CHECK-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT:    store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4
-; CHECK-NEXT:    ret void
+; AVX1-LABEL: @powof2div_nonuniform(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+; AVX1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4
+; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; AVX1-NEXT:    [[DIV:%.*]] = sdiv i32 [[ADD]], 2
+; AVX1-NEXT:    store i32 [[DIV]], i32* [[A:%.*]], align 4
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
+; AVX1-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1
+; AVX1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
+; AVX1-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]]
+; AVX1-NEXT:    [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; AVX1-NEXT:    store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; AVX1-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
+; AVX1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; AVX1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
+; AVX1-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
+; AVX1-NEXT:    [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8
+; AVX1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; AVX1-NEXT:    store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4
+; AVX1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; AVX1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
+; AVX1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; AVX1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
+; AVX1-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
+; AVX1-NEXT:    [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16
+; AVX1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; AVX1-NEXT:    store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @powof2div_nonuniform(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; AVX2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; AVX2-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; AVX2-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; AVX2-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; AVX2-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; AVX2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; AVX2-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; AVX2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; AVX2-NEXT:    [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 2, i32 4, i32 8, i32 16>
+; AVX2-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; AVX2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; AVX2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; AVX2-NEXT:    ret void
 ;
 entry:
   %0 = load i32, i32* %b, align 4