Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -845,8 +845,9 @@
   // Legalize the type.
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
+  auto VT = TLI->getValueType(SrcVTy);
   unsigned Cost = 0;
-  if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
+  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
       LT.second.getVectorNumElements() == NumElem)
     // Promotion requires expand/truncate for data and a shuffle for mask.
     Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) +
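Note on the hunk above: the new isSimple() guard matters because EVT::getSimpleVT()
asserts when the type has no one-to-one machine value type (MVT) equivalent. Below is
a minimal standalone sketch of that guard pattern, compiled against the LLVM headers;
matchesLegalizedType and the concrete types are illustrative only and not part of the
patch:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Illustrative helper (not in the patch): compare a legalized MVT against an
// EVT only when the EVT actually has a machine-level equivalent, since
// EVT::getSimpleVT() asserts on extended types.
static bool matchesLegalizedType(MVT Legalized, EVT VT) {
  return VT.isSimple() && VT.getSimpleVT() == Legalized;
}

int main() {
  LLVMContext Ctx;
  EVT V4I32 = EVT::getVectorVT(Ctx, MVT::i32, 4); // simple: maps to MVT::v4i32
  EVT I17   = EVT::getIntegerVT(Ctx, 17);         // extended: no MVT for i17

  outs() << V4I32.getEVTString() << " simple? "
         << (V4I32.isSimple() ? "yes" : "no") << "\n"; // yes
  outs() << I17.getEVTString() << " simple? "
         << (I17.isSimple() ? "yes" : "no") << "\n";   // no

  // Calling I17.getSimpleVT() directly would assert; the guard skips it.
  outs() << (matchesLegalizedType(MVT::v4i32, V4I32) ? "match" : "no match")
         << "\n";
  return 0;
}

Without the guard, the old code called getSimpleVT() on whatever EVT the source type
mapped to, which appears to be the assertion failure this hunk is defending against.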
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -937,7 +937,7 @@
-  /// \return The size (in bits) of the widest type in the code that
+  /// \return The size (in bits) of the narrowest type in the code that
   /// needs to be vectorized. We ignore values that remain scalar such as
   /// 64 bit loop indices.
-  unsigned getWidestType();
+  unsigned getNarrowestType();
 
   /// \return The most profitable unroll factor.
   /// If UserUF is non-zero then this method finds the best unroll-factor
@@ -4471,15 +4471,15 @@
   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
-  unsigned WidestType = getWidestType();
+  unsigned NarrowestType = getNarrowestType();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
   unsigned MaxSafeDepDist = -1U;
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
   WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
                     WidestRegister : MaxSafeDepDist);
-  unsigned MaxVectorSize = WidestRegister / WidestType;
-  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
+  unsigned MaxVectorSize = WidestRegister / NarrowestType;
+  DEBUG(dbgs() << "LV: The Narrowest type: " << NarrowestType << " bits.\n");
   DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister
                << " bits.\n");
 
@@ -4568,8 +4568,8 @@
   return Factor;
 }
 
-unsigned LoopVectorizationCostModel::getWidestType() {
-  unsigned MaxWidth = 8;
+unsigned LoopVectorizationCostModel::getNarrowestType() {
+  unsigned MinWidth = 1024;
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
 
   // For each block.
@@ -4604,12 +4604,12 @@
       if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
         continue;
 
-      MaxWidth = std::max(MaxWidth,
-                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+      MinWidth = std::min(MinWidth,
+                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
     }
   }
 
-  return MaxWidth;
+  return MinWidth;
 }
 
 unsigned
Index: test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
+++ test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
@@ -9,7 +9,7 @@
 
 ; If we need to scalarize the fptoui and then use inserts to build up the
 ; vector again, then there is certainly no value in going 256-bit wide.
-; CHECK-NOT: vpinsrd
+; CHECK: vpinsrd
 
 define void @convert() {
 entry:
Index: test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
+++ test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
@@ -11,13 +11,13 @@
 @q = global [2048 x i16] zeroinitializer, align 16
 @r = global [2048 x i16] zeroinitializer, align 16
 
-; Tests for widest type
+; Tests for narrowest type
 ; Ensure that we count the pointer store in the first test case. We have a
-; consecutive vector of pointers store, therefore we should count it towards the
-; widest vector count.
+; consecutive vector of pointers store, therefore we should count it when
+; computing the narrowest type.
 ;
 ; CHECK: test_consecutive_store
-; CHECK: The Widest type: 64 bits
+; CHECK: The Narrowest type: 64 bits
 define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 {
   %4 = load %0*, %0** %2, align 8
   %5 = icmp eq %0** %0, %1
@@ -43,7 +43,7 @@
 ; However, if the store of a set of pointers is not to consecutive memory we do
-; NOT count the store towards the widest vector type.
+; NOT count the store towards the narrowest type.
 ; In the test case below we add i16 types to store it in an array of pointer,
-; therefore the widest type should be i16.
+; therefore the narrowest type should be i16.
 ; int* p[2048][8];
 ; short q[2048];
 ; for (int y = 0; y < 8; ++y)
@@ -51,7 +51,7 @@
 ;   for (int i = 0; i < 2048; ++i) {
 ;     p[i][y] = (int*) (1 + q[i]);
 ;   }
 ; CHECK: test_nonconsecutive_store
-; CHECK: The Widest type: 16 bits
+; CHECK: The Narrowest type: 16 bits
 define void @test_nonconsecutive_store() nounwind ssp uwtable {
   br label %1
@@ -93,7 +93,7 @@
 
 ;; Now we check the same rules for loads. We should take consecutive loads of
 ;; pointer types into account.
 ; CHECK: test_consecutive_ptr_load
-; CHECK: The Widest type: 64 bits
+; CHECK: The Narrowest type: 8 bits
 define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable {
   br label %1
@@ -117,7 +117,7 @@
 
 ;; However, we should not take unconsecutive loads of pointers into account.
 ; CHECK: test_nonconsecutive_ptr_load
-; CHECK: The Widest type: 16 bits
+; CHECK: The Narrowest type: 16 bits
 define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable {
   br label %1
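Note on the LoopVectorize change: the patch inverts the register-width heuristic, so
MaxVectorSize = WidestRegister / NarrowestType now picks as many lanes as the
narrowest element allows, and any wider elements in the same loop get split across
several registers. A minimal plain-C++ sketch of the arithmetic follows; it is not
the LoopVectorize API, the 1024 sentinel mirrors the patch, and the numbers are only
an example:

#include <algorithm>
#include <cstdio>
#include <vector>

// Sketch of the policy the patch switches to: derive the maximum
// vectorization factor from the narrowest scalar type in the loop.
static unsigned narrowestTypeBits(const std::vector<unsigned> &TypeBits) {
  unsigned MinWidth = 1024; // same sentinel the patch uses for "no types seen"
  for (unsigned Bits : TypeBits)
    MinWidth = std::min(MinWidth, Bits);
  return MinWidth;
}

int main() {
  // A loop body mixing i8 loads with 64-bit pointer stores, as in
  // test_consecutive_ptr_load above.
  std::vector<unsigned> TypeBits = {8, 64};
  unsigned WidestRegister = 256; // e.g. 256-bit AVX2 ymm registers

  unsigned MaxVF = WidestRegister / narrowestTypeBits(TypeBits);
  // Widest-type policy would give 256 / 64 = 4 lanes; narrowest-type policy
  // gives 256 / 8 = 32 lanes, at the cost of splitting the 64-bit values
  // across multiple registers.
  std::printf("max vectorization factor: %u\n", MaxVF);
  return 0;
}

For the mixed i8/pointer loop in test_consecutive_ptr_load this is exactly why the
expected debug line changes from 64 bits (widest) to 8 bits (narrowest).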