diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3621,6 +3621,27 @@
   return Cost;
 }
 
+/// Shuffles \p Mask in accordance with the given \p SubMask.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+  if (SubMask.empty())
+    return;
+  if (Mask.empty()) {
+    Mask.append(SubMask.begin(), SubMask.end());
+    return;
+  }
+  SmallVector<int> NewMask(SubMask.size(), SubMask.size());
+  int TermValue = std::min(Mask.size(), SubMask.size());
+  for (int I = 0, E = SubMask.size(); I < E; ++I) {
+    if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
+        Mask[SubMask[I]] >= TermValue) {
+      NewMask[I] = UndefMaskElem;
+      continue;
+    }
+    NewMask[I] = Mask[SubMask[I]];
+  }
+  Mask.swap(NewMask);
+}
+
 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                       ArrayRef<Value *> VectorizedVals) {
   ArrayRef<Value *> VL = E->Scalars;
@@ -3633,6 +3654,7 @@
   else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
     ScalarTy = IE->getOperand(1)->getType();
   auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+  auto *FinalVecTy = VecTy;
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
   // If we have computed a smaller type for the expression, update VecTy so
@@ -3643,12 +3665,9 @@
 
   unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
-  InstructionCost ReuseShuffleCost = 0;
-  if (NeedToShuffleReuses) {
-    ReuseShuffleCost =
-        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
-                            E->ReuseShuffleIndices);
-  }
+  if (NeedToShuffleReuses)
+    FinalVecTy =
+        FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers);
   // FIXME: it tries to fix a problem with MSVC buildbots.
   TargetTransformInfo &TTIRef = *TTI;
   auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
@@ -3737,6 +3756,10 @@
             dbgs()
             << "SLP: perfect diamond match for gather bundle that starts with "
             << *VL.front() << ".\n");
+        if (NeedToShuffleReuses)
+          GatherCost =
+              TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  FinalVecTy, E->ReuseShuffleIndices);
       } else {
         LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
                           << " entries for bundle that starts with "
@@ -3744,16 +3767,15 @@
         // Detected that instead of gather we can emit a shuffle of single/two
        // previously vectorized nodes. Add the cost of the permutation rather
        // than gather.
-        GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
+        ::addMask(Mask, E->ReuseShuffleIndices);
+        GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
       }
-      return ReuseShuffleCost + GatherCost;
+      return GatherCost;
     }
     if (isSplat(VL)) {
       // Found the broadcasting of the single scalar, calculate the cost as the
       // broadcast.
-      return ReuseShuffleCost +
-             TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
-                                 0);
+      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
     }
     if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
        allSameBlock(VL) &&
@@ -3771,11 +3793,36 @@
         InstructionCost Cost =
             computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
         AdjustExtractsCost(Cost, /*IsGather=*/true);
-        return ReuseShuffleCost + Cost;
+        if (NeedToShuffleReuses)
+          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                      FinalVecTy, E->ReuseShuffleIndices);
+        return Cost;
       }
     }
+    InstructionCost ReuseShuffleCost = 0;
+    if (NeedToShuffleReuses)
+      ReuseShuffleCost = TTI->getShuffleCost(
+          TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
     return ReuseShuffleCost + getGatherCost(VL);
   }
+  InstructionCost CommonCost = 0;
+  SmallVector<int> Mask;
+  if (!E->ReorderIndices.empty()) {
+    SmallVector<int> NewMask;
+    if (E->getOpcode() == Instruction::Store) {
+      // For stores the order is actually a mask.
+      NewMask.resize(E->ReorderIndices.size());
+      copy(E->ReorderIndices, NewMask.begin());
+    } else {
+      inversePermutation(E->ReorderIndices, NewMask);
+    }
+    ::addMask(Mask, NewMask);
+  }
+  if (NeedToShuffleReuses)
+    ::addMask(Mask, E->ReuseShuffleIndices);
+  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
+    CommonCost =
+        TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize) &&
          "Unhandled state");
@@ -3797,12 +3844,12 @@
       for (unsigned I : E->ReuseShuffleIndices) {
         if (ShuffleOrOp == Instruction::ExtractElement) {
           auto *EE = cast<ExtractElementInst>(VL[I]);
-          ReuseShuffleCost -= TTI->getVectorInstrCost(
-              Instruction::ExtractElement, EE->getVectorOperandType(),
-              *getExtractIndex(EE));
+          CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                EE->getVectorOperandType(),
+                                                *getExtractIndex(EE));
         } else {
-          ReuseShuffleCost -= TTI->getVectorInstrCost(
-              Instruction::ExtractElement, VecTy, Idx);
+          CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                VecTy, Idx);
           ++Idx;
         }
       }
@@ -3810,21 +3857,15 @@
       for (Value *V : VL) {
         if (ShuffleOrOp == Instruction::ExtractElement) {
           auto *EE = cast<ExtractElementInst>(V);
-          ReuseShuffleCost += TTI->getVectorInstrCost(
-              Instruction::ExtractElement, EE->getVectorOperandType(),
-              *getExtractIndex(EE));
+          CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                EE->getVectorOperandType(),
+                                                *getExtractIndex(EE));
         } else {
           --Idx;
-          ReuseShuffleCost += TTI->getVectorInstrCost(
-              Instruction::ExtractElement, VecTy, Idx);
+          CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                VecTy, Idx);
         }
       }
-      CommonCost = ReuseShuffleCost;
-    } else if (!E->ReorderIndices.empty()) {
-      SmallVector<int> NewMask;
-      inversePermutation(E->ReorderIndices, NewMask);
-      CommonCost = TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
     }
     if (ShuffleOrOp == Instruction::ExtractValue) {
       for (unsigned I = 0, E = VL.size(); I < E; ++I) {
@@ -3915,7 +3956,7 @@
         TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
                               TTI::getCastContextHint(VL0), CostKind, VL0);
     if (NeedToShuffleReuses) {
-      ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+      CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
    }
 
     // Calculate the cost of this instruction.
@@ -3925,12 +3966,11 @@
     InstructionCost VecCost = 0;
     // Check if the values are candidates to demote.
     if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
-      VecCost =
-          ReuseShuffleCost +
-          TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
-                                TTI::getCastContextHint(VL0), CostKind, VL0);
+      VecCost = CommonCost + TTI->getCastInstrCost(
+                                 E->getOpcode(), VecTy, SrcVecTy,
+                                 TTI::getCastContextHint(VL0), CostKind, VL0);
     }
-    LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
     return VecCost - ScalarCost;
   }
   case Instruction::FCmp:
@@ -3941,7 +3981,7 @@
         TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
                                 CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
     if (NeedToShuffleReuses) {
-      ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+      CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
     }
     auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
     InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
@@ -3982,8 +4022,8 @@
                                   CmpInst::BAD_ICMP_PREDICATE, CostKind);
       VecCost = std::min(VecCost, IntrinsicCost);
     }
-    LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-    return ReuseShuffleCost + VecCost - ScalarCost;
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+    return CommonCost + VecCost - ScalarCost;
   }
   case Instruction::FNeg:
   case Instruction::Add:
@@ -4046,14 +4086,14 @@
         TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
                                     Op2VK, Op1VP, Op2VP, Operands, VL0);
     if (NeedToShuffleReuses) {
-      ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+      CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
     }
     InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
     InstructionCost VecCost =
         TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
                                     Op2VK, Op1VP, Op2VP, Operands, VL0);
-    LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-    return ReuseShuffleCost + VecCost - ScalarCost;
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+    return CommonCost + VecCost - ScalarCost;
   }
   case Instruction::GetElementPtr: {
     TargetTransformInfo::OperandValueKind Op1VK =
@@ -4064,13 +4104,13 @@
     InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
         Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
     if (NeedToShuffleReuses) {
-      ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+      CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
     }
     InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
     InstructionCost VecCost = TTI->getArithmeticInstrCost(
         Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
-    LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-    return ReuseShuffleCost + VecCost - ScalarCost;
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+    return CommonCost + VecCost - ScalarCost;
   }
   case Instruction::Load: {
     // Cost of wide load - cost of scalar loads.
@@ -4078,7 +4118,7 @@
     InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
         Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
     if (NeedToShuffleReuses) {
-      ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+      CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
     }
     InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
     InstructionCost VecLdCost;
@@ -4095,14 +4135,8 @@
           Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
           /*VariableMask=*/false, Alignment, CostKind, VL0);
     }
-    if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
-      SmallVector<int> NewMask;
-      inversePermutation(E->ReorderIndices, NewMask);
-      VecLdCost += TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
-    }
-    LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
-    return ReuseShuffleCost + VecLdCost - ScalarLdCost;
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
+    return CommonCost + VecLdCost - ScalarLdCost;
   }
   case Instruction::Store: {
     // We know that we can merge the stores. Calculate the cost.
@@ -4115,14 +4149,8 @@
     InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
     InstructionCost VecStCost = TTI->getMemoryOpCost(
         Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
-    if (IsReorder) {
-      SmallVector<int> NewMask;
-      inversePermutation(E->ReorderIndices, NewMask);
-      VecStCost += TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
-    }
-    LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
-    return VecStCost - ScalarStCost;
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
+    return CommonCost + VecStCost - ScalarStCost;
   }
   case Instruction::Call: {
     CallInst *CI = cast<CallInst>(VL0);
@@ -4133,7 +4161,7 @@
     InstructionCost ScalarEltCost =
         TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
     if (NeedToShuffleReuses) {
-      ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+      CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
     }
     InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
@@ -4145,7 +4173,7 @@
                       << " (" << VecCallCost << "-" << ScalarCallCost << ")"
                       << " for " << *CI << "\n");
 
-    return ReuseShuffleCost + VecCallCost - ScalarCallCost;
+    return CommonCost + VecCallCost - ScalarCallCost;
   }
   case Instruction::ShuffleVector: {
     assert(E->isAltShuffle() &&
@@ -4158,11 +4186,11 @@
     if (NeedToShuffleReuses) {
       for (unsigned Idx : E->ReuseShuffleIndices) {
         Instruction *I = cast<Instruction>(VL[Idx]);
-        ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
+        CommonCost -= TTI->getInstructionCost(I, CostKind);
       }
       for (Value *V : VL) {
         Instruction *I = cast<Instruction>(V);
-        ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
+        CommonCost += TTI->getInstructionCost(I, CostKind);
       }
     }
     for (Value *V : VL) {
@@ -4196,8 +4224,8 @@
     }
     VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy,
                                    Mask, 0);
-    LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-    return ReuseShuffleCost + VecCost - ScalarCost;
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+    return CommonCost + VecCost - ScalarCost;
   }
   default:
     llvm_unreachable("Unknown instruction");
@@ -4929,25 +4957,7 @@
     addMask(NewMask);
   }
 
-  void addMask(ArrayRef<int> SubMask) {
-    if (SubMask.empty())
-      return;
-    if (Mask.empty()) {
-      Mask.append(SubMask.begin(), SubMask.end());
-      return;
-    }
-    SmallVector<int> NewMask(SubMask.size(), SubMask.size());
-    int TermValue = std::min(Mask.size(), SubMask.size());
-    for (int I = 0, E = SubMask.size(); I < E; ++I) {
-      if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
-          Mask[SubMask[I]] >= TermValue) {
-        NewMask[I] = UndefMaskElem;
-        continue;
-      }
-      NewMask[I] = Mask[SubMask[I]];
-    }
-    Mask.swap(NewMask);
-  }
+  void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); }
 
   Value *finalize(Value *V) {
     IsFinalized = true;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -272,18 +272,21 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[V1:%.*]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[V2:%.*]], align 4
 ; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR0]], align 16
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR1]], align 4
 ; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR2]], align 8
 ; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR3]], align 4
 ; CHECK-NEXT:    [[PTR4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR4]], align 16
 ; CHECK-NEXT:    [[PTR5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 5
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR5]], align 4
 ; CHECK-NEXT:    [[PTR6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 6
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR6]], align 8
 ; CHECK-NEXT:    [[PTR7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 7
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[PTR0]] to <8 x i32>*
-; CHECK-NEXT:    store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP5]], align 16
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %1 = load i32, i32* %v1, align 4
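
Note (illustration only, not part of the patch): the new file-scope ::addMask composes two shuffle masks so the cost model can price one combined permutation instead of two sequential ones. The defining relation is NewMask[I] = Mask[SubMask[I]], i.e. the composed mask is equivalent to shuffling by the old Mask first and by SubMask second, with undef and out-of-range entries collapsing to undef. Below is a minimal standalone C++ sketch of those semantics; std::vector<int> and the -1 sentinel are assumed stand-ins for LLVM's SmallVectorImpl<int> and UndefMaskElem.

// Standalone sketch of the ::addMask mask-composition semantics.
#include <algorithm>
#include <cassert>
#include <vector>

static constexpr int Undef = -1; // stand-in for UndefMaskElem

static void addMask(std::vector<int> &Mask, const std::vector<int> &SubMask) {
  if (SubMask.empty())
    return;
  if (Mask.empty()) { // nothing to compose with yet
    Mask = SubMask;
    return;
  }
  std::vector<int> NewMask(SubMask.size());
  int TermValue = static_cast<int>(std::min(Mask.size(), SubMask.size()));
  for (int I = 0, E = static_cast<int>(SubMask.size()); I < E; ++I) {
    // Undef or out-of-range entries in either mask yield undef.
    if (SubMask[I] == Undef || SubMask[I] >= TermValue ||
        Mask[SubMask[I]] >= TermValue) {
      NewMask[I] = Undef;
      continue;
    }
    // Composition: applying Mask first and SubMask second.
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}

int main() {
  std::vector<int> Mask = {1, 0, 3, 2}; // swap adjacent pairs
  addMask(Mask, {2, 3, 0, 1});          // then swap the two halves
  // One shuffle by {3, 2, 1, 0} equals applying both masks in sequence.
  assert((Mask == std::vector<int>{3, 2, 1, 0}));
  return 0;
}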