diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2871,7 +2871,7 @@
         return;
       }
 
-      LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+      LLVM_DEBUG(dbgs() << "SLP: Gathering non-vectorized loads.\n");
       BS.cancelScheduling(VL, VL0);
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
                    ReuseShuffleIndicies);
@@ -3132,12 +3132,20 @@
           }
           return;
         }
+        // Vectorizing non-consecutive stores with `llvm.masked.scatter`.
+        TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
+                                     UserTreeIdx, ReuseShuffleIndicies);
+        TE->setOperandsInOrder();
+        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+        LLVM_DEBUG(
+            dbgs() << "SLP: added a vector of non-consecutive stores.\n");
+        return;
       }
 
       BS.cancelScheduling(VL, VL0);
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
                    ReuseShuffleIndicies);
-      LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+      LLVM_DEBUG(dbgs() << "SLP: Gathering non-vectorized store.\n");
       return;
     }
     case Instruction::Call: {
@@ -3704,23 +3712,26 @@
     }
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
-      Align alignment = cast<LoadInst>(VL0)->getAlign();
-      int ScalarEltCost =
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0,
-                               CostKind, VL0);
+      Align Alignment = cast<LoadInst>(VL0)->getAlign();
+      int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
+                                               Alignment, 0, CostKind, VL0);
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
       int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
       int VecLdCost;
       if (E->State == TreeEntry::Vectorize) {
-        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
                                          CostKind, VL0);
       } else {
         assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+        Align CommonAlignment = Alignment;
+        for (Value *V : VL)
+          CommonAlignment =
+              commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
         VecLdCost = TTI->getGatherScatterOpCost(
             Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
-            /*VariableMask=*/false, alignment, CostKind, VL0);
+            /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
       }
       if (!E->ReorderIndices.empty()) {
         // TODO: Merge this shuffle with the ReuseShuffleCost.
@@ -3741,8 +3752,21 @@
       if (NeedToShuffleReuses)
         ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
-      int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                           VecTy, Alignment, 0, CostKind, VL0);
+      int VecStCost;
+      if (E->State == TreeEntry::Vectorize) {
+        VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment,
+                                         0, CostKind, VL0);
+      } else {
+        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
+        Align CommonAlignment = Alignment;
+        for (Value *V : VL)
+          CommonAlignment =
+              commonAlignment(CommonAlignment, cast<StoreInst>(V)->getAlign());
+        VecStCost = TTI->getGatherScatterOpCost(
+            Instruction::Store, VecTy,
+            cast<StoreInst>(VL0)->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
+      }
       if (IsReorder) {
         // TODO: Merge this shuffle with the ReuseShuffleCost.
         VecStCost += TTI->getShuffleCost(
@@ -4591,25 +4615,37 @@
       setInsertPointAfterBundle(E);
 
+      Instruction *NewSI;
       Value *VecValue = vectorizeTree(E->getOperand(0));
       if (IsReorder) {
         SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
                                  E->ReorderIndices.end());
         VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf");
       }
-      Value *ScalarPtr = SI->getPointerOperand();
-      Value *VecPtr = Builder.CreateBitCast(
-          ScalarPtr, VecValue->getType()->getPointerTo(AS));
-      StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
-                                                 SI->getAlign());
-
-      // The pointer operand uses an in-tree scalar, so add the new BitCast to
-      // ExternalUses to make sure that an extract will be generated in the
-      // future.
-      if (getTreeEntry(ScalarPtr))
-        ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
+      if (E->State == TreeEntry::Vectorize) {
+        Value *ScalarPtr = SI->getPointerOperand();
+        Value *VecPtr = Builder.CreateBitCast(
+            ScalarPtr, VecValue->getType()->getPointerTo(AS));
+        NewSI = Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign());
+
+        // The pointer operand uses an in-tree scalar, so add the new BitCast to
+        // ExternalUses to make sure that an extract will be generated in the
+        // future.
+        if (getTreeEntry(ScalarPtr))
+          ExternalUses.push_back(
+              ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
+      } else {
+        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
+        Value *VecPtr = vectorizeTree(E->getOperand(1));
+        // Use the minimum alignment of the scattered stores.
+        Align CommonAlignment = SI->getAlign();
+        for (Value *V : E->Scalars)
+          CommonAlignment =
+              commonAlignment(CommonAlignment, cast<StoreInst>(V)->getAlign());
+        NewSI = Builder.CreateMaskedScatter(VecValue, VecPtr, CommonAlignment);
+      }
 
-      Value *V = propagateMetadata(ST, E->Scalars);
+      Value *V = propagateMetadata(NewSI, E->Scalars);
       if (NeedToShuffleReuses)
         V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
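Note (illustration only, not part of the patch): the new ScatterVectorize store path above boils down to folding the per-lane store alignments with commonAlignment() and emitting a single llvm.masked.scatter through IRBuilder::CreateMaskedScatter(). The standalone sketch below shows that call sequence against LLVM's C++ API of roughly this period; the function name store4, the <4 x i32> / <4 x i32*> operand types, and the lane alignments are invented for the example.

// scatter_demo.cpp -- hypothetical, standalone; shows the IRBuilder calls the
// ScatterVectorize store path relies on. Build against LLVM, link LLVMCore.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("scatter-demo", Ctx);
  IRBuilder<> Builder(Ctx);

  // void @store4(<4 x i32> %vals, <4 x i32*> %ptrs) -- stand-ins for the
  // vectorized value operand and the vectorized pointer operand that the
  // patch obtains from vectorizeTree(E->getOperand(0)) and
  // vectorizeTree(E->getOperand(1)).
  Type *I32 = Type::getInt32Ty(Ctx);
  auto *ValTy = FixedVectorType::get(I32, 4);
  auto *PtrTy = FixedVectorType::get(I32->getPointerTo(), 4);
  Function *F = Function::Create(
      FunctionType::get(Type::getVoidTy(Ctx), {ValTy, PtrTy}, false),
      Function::ExternalLinkage, "store4", M);
  Builder.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

  // Mirror the patch: fold the per-lane alignments down to their minimum with
  // commonAlignment(). The lane alignments here are made up for the demo.
  Align CommonAlignment(16);
  for (Align A : {Align(4), Align(8), Align(4)})
    CommonAlignment = commonAlignment(CommonAlignment, A);

  // Emit llvm.masked.scatter; with no mask argument IRBuilder supplies an
  // all-ones <4 x i1> mask, i.e. every lane is stored.
  Builder.CreateMaskedScatter(F->getArg(0), F->getArg(1), CommonAlignment);
  Builder.CreateRetVoid();

  if (verifyModule(M, &errs()))
    return 1;
  M.print(outs(), nullptr);
  return 0;
}

Taking the minimum of the lane alignments is the conservative choice: the single scatter instruction may not assume more alignment than its least-aligned lane, which is what the "Use the minimum alignment of the scattered stores" comment in the vectorizeTree() hunk encodes.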