Diff 309919

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,474 Lines • ▼ Show 20 Lines int BoUpSLP::getEntryCost(TreeEntry *E) {

unsigned ShuffleOrOp = unsigned ShuffleOrOp =

E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();

switch (ShuffleOrOp) { switch (ShuffleOrOp) {

case Instruction::PHI: case Instruction::PHI:

return 0; return 0;

case Instruction::ExtractValue: case Instruction::ExtractValue:

case Instruction::ExtractElement: { case Instruction::ExtractElement: {

int DeadCost = 0;

if (NeedToShuffleReuses) { if (NeedToShuffleReuses) {

unsigned Idx = 0; unsigned Idx = 0;

for (unsigned I : E->ReuseShuffleIndices) { for (unsigned I : E->ReuseShuffleIndices) {

if (ShuffleOrOp == Instruction::ExtractElement) { if (ShuffleOrOp == Instruction::ExtractElement) {

auto *IO = cast<ConstantInt>( auto *IO = cast<ConstantInt>(

cast<ExtractElementInst>(VL[I])->getIndexOperand()); cast<ExtractElementInst>(VL[I])->getIndexOperand());

Idx = IO->getZExtValue(); Idx = IO->getZExtValue();

ReuseShuffleCost -= TTI->getVectorInstrCost( ReuseShuffleCost -= TTI->getVectorInstrCost(

Show All 10 Lines case Instruction::ExtractElement: {

auto *IO = cast<ConstantInt>( auto *IO = cast<ConstantInt>(

cast<ExtractElementInst>(V)->getIndexOperand()); cast<ExtractElementInst>(V)->getIndexOperand());

Idx = IO->getZExtValue(); Idx = IO->getZExtValue();

} else { } else {

--Idx; --Idx;

} }

ReuseShuffleCost += ReuseShuffleCost +=

TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);

} }

} DeadCost = ReuseShuffleCost;

anton-afanasyevUnsubmitted

Not Done

Here += looks confusing, we can just set DeadCost = ..., since DeadCost == 0.

anton-afanasyev: Here `+=` looks confusing, we can just set `DeadCost = ...`, since `DeadCost == 0`.

ABataevAuthorUnsubmitted

Done

No, DeadCost is not 0, it is ReuseShuffleCost

ABataev: No, `DeadCost` is not `0`, it is `ReuseShuffleCost`

anton-afanasyevUnsubmitted

Not Done

If NeedToShuffleReuses == false then ReuseShuffleCost == 0, so DeadCost == 0.

anton-afanasyev: If `NeedToShuffleReuses == false` then `ReuseShuffleCost == 0`, so `DeadCost == 0`.

anton-afanasyevUnsubmitted

Not Done

TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);

}

+ DeadCost = ReuseShuffleCost;

+ } else if (!E->ReorderIndices.empty()) {

+ DeadCost = ...;

}

int DeadCost = ReuseShuffleCost;

anton-afanasyev:

ABataevAuthorUnsubmitted

Done

I don't get it. What do you want to see here?

ABataev: I don't get it. What do you want to see here?

anton-afanasyevUnsubmitted

Not Done

I suggest merging the code below (starting from if (!NeedToShuffleReuses && !E->ReorderIndices.empty())) into this if-block:

if (NeedToShuffleReuses) {
  ...
  DeadCost = ReuseShuffleCost;
} else if (!E->ReorderIndices.empty()) {
  DeadCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}

anton-afanasyev: I suggest to join code below (starting from `if (!NeedToShuffleReuses && !E->ReorderIndices.

ABataevAuthorUnsubmitted

Done

Ok, I see

ABataev: Ok, I see

anton-afanasyevUnsubmitted

Not Done

So you can still set DeadCost = ... instead of +=. It's simpler and makes it clear that the starting cost comes either from the shuffle reuse or from the reorder indices, not both.

anton-afanasyev: So, you still can set `DeadCost = ...` instead of `+=`. It's simpler and points that we can get…

int DeadCost = ReuseShuffleCost; } else if (!E->ReorderIndices.empty()) {

if (!E->ReorderIndices.empty()) { DeadCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,

// TODO: Merge this shuffle with the ReuseShuffleCost. VecTy);

DeadCost += TTI->getShuffleCost(

TargetTransformInfo::SK_PermuteSingleSrc, VecTy);

} }

for (unsigned I = 0, E = VL.size(); I < E; ++I) { for (unsigned I = 0, E = VL.size(); I < E; ++I) {

Instruction *EI = cast<Instruction>(VL[I]); Instruction *EI = cast<Instruction>(VL[I]);

// If all users are going to be vectorized, instruction can be // If all users are going to be vectorized, instruction can be

// considered as dead. // considered as dead.

// The same, if have only one user, it will be vectorized for sure. // The same, if have only one user, it will be vectorized for sure.

if (areAllUsersVectorized(EI)) { if (areAllUsersVectorized(EI)) {

// Take credit for instruction that will become dead. // Take credit for instruction that will become dead.

▲ Show 20 Lines • Show All 209 Lines • ▼ Show 20 Lines case Instruction::Load: {

VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,

CostKind, VL0); CostKind, VL0);

} else { } else {

assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");

VecLdCost = TTI->getGatherScatterOpCost( VecLdCost = TTI->getGatherScatterOpCost(

Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),

/*VariableMask=*/false, alignment, CostKind, VL0); /*VariableMask=*/false, alignment, CostKind, VL0);

} }

if (!E->ReorderIndices.empty()) { if (!NeedToShuffleReuses && !E->ReorderIndices.empty())

// TODO: Merge this shuffle with the ReuseShuffleCost.

VecLdCost += TTI->getShuffleCost( VecLdCost += TTI->getShuffleCost(

TargetTransformInfo::SK_PermuteSingleSrc, VecTy); TargetTransformInfo::SK_PermuteSingleSrc, VecTy);

}

LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));

return ReuseShuffleCost + VecLdCost - ScalarLdCost; return ReuseShuffleCost + VecLdCost - ScalarLdCost;

} }

case Instruction::Store: { case Instruction::Store: {

// We know that we can merge the stores. Calculate the cost. // We know that we can merge the stores. Calculate the cost.

bool IsReorder = !E->ReorderIndices.empty(); bool IsReorder = !E->ReorderIndices.empty();

auto *SI = auto *SI =

cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);

Align Alignment = SI->getAlign(); Align Alignment = SI->getAlign();

int ScalarEltCost = int ScalarEltCost =

TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,

CostKind, VL0); CostKind, VL0);

if (NeedToShuffleReuses)

ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;

int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;

int VecStCost = TTI->getMemoryOpCost(Instruction::Store, int VecStCost = TTI->getMemoryOpCost(Instruction::Store,

VecTy, Alignment, 0, CostKind, VL0); VecTy, Alignment, 0, CostKind, VL0);

if (IsReorder) { if (IsReorder)

// TODO: Merge this shuffle with the ReuseShuffleCost.

VecStCost += TTI->getShuffleCost( VecStCost += TTI->getShuffleCost(

TargetTransformInfo::SK_PermuteSingleSrc, VecTy); TargetTransformInfo::SK_PermuteSingleSrc, VecTy);

}

LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost)); LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));

return ReuseShuffleCost + VecStCost - ScalarStCost; return VecStCost - ScalarStCost;

} }

case Instruction::Call: { case Instruction::Call: {

CallInst *CI = cast<CallInst>(VL0); CallInst *CI = cast<CallInst>(VL0);

Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

// Calculate the cost of the scalar and vector calls. // Calculate the cost of the scalar and vector calls.

IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1); IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1);

int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind);

▲ Show 20 Lines • Show All 527 Lines • ▼ Show 20 Lines if (!ReuseShuffleIndicies.empty()) {

if (auto *I = dyn_cast<Instruction>(Vec)) { if (auto *I = dyn_cast<Instruction>(Vec)) {

GatherSeq.insert(I); GatherSeq.insert(I);

CSEBlocks.insert(I->getParent()); CSEBlocks.insert(I->getParent());

} }

return Vec; return Vec;

} }

namespace {

/// Merges shuffle masks and emits final shuffle instruction, if required.

class ShuffleInstructionBuilder {

IRBuilderBase &Builder;

bool IsFinalized = false;

SmallVector<int, 4> Mask;

public:

ShuffleInstructionBuilder(IRBuilderBase &Builder) : Builder(Builder) {}

/// Runs \p SubMask through inversePermutation() and merges the result into
/// the accumulated mask. An empty \p SubMask is ignored.
void addInversedMask(ArrayRef<unsigned> SubMask) {
  if (!SubMask.empty()) {
    SmallVector<int, 4> Inverted;
    inversePermutation(SubMask, Inverted);
    addMask(Inverted);
  }
}

/// Merges \p SubMask into the accumulated mask. The unsigned indices are
/// copied into an int vector and forwarded to the ArrayRef<int> overload.
void addMask(ArrayRef<unsigned> SubMask) {
  SmallVector<int, 4> SignedMask;
  SignedMask.append(SubMask.begin(), SubMask.end());
  addMask(ArrayRef<int>(SignedMask));
}

/// Merges \p SubMask into the accumulated mask.
///
/// An empty \p SubMask is ignored. If nothing has been accumulated yet,
/// \p SubMask simply becomes the mask. Otherwise element I of the new mask is
/// Mask[SubMask[I]]; when either SubMask[I] or Mask[SubMask[I]] is not below
/// min(Mask.size(), SubMask.size()), the element is set to SubMask.size()
/// instead.
void addMask(ArrayRef<int> SubMask) {
  if (SubMask.empty())
    return;
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  const int NumElts = SubMask.size();
  const int Limit = std::min(Mask.size(), SubMask.size());
  // Every slot starts out as the out-of-range marker (NumElts); only slots
  // whose source indices are in range get overwritten below.
  SmallVector<int, 4> Composed(SubMask.size(), NumElts);
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    const int Src = SubMask[Idx];
    // Mask[Src] is only read once Src is known to be below Limit.
    if (Src < Limit && Mask[Src] < Limit)
      Composed[Idx] = Mask[Src];
  }
  Mask.swap(Composed);
}

anton-afanasyevUnsubmitted

Not Done

This doesn't change anything but it looks preferable for IsFinalized to be true after finalization even for empty Mask.

anton-afanasyev: This doesn't change anything but it looks preferable for `IsFinalized` to be true after…

/// Marks the builder as finalized and returns the resulting value.
/// If no mask was accumulated, \p V is returned unchanged; otherwise the
/// result of Builder.CreateShuffleVector(V, Mask, "shuffle") is returned.
Value *finalize(Value *V) {
  IsFinalized = true;
  return Mask.empty() ? V : Builder.CreateShuffleVector(V, Mask, "shuffle");
}

/// Asserts that a non-empty accumulated mask was not dropped: either no mask
/// was ever added, or finalize() has been called.
~ShuffleInstructionBuilder() {
  assert((Mask.empty() || IsFinalized) &&
         "Must be finalized construction of the shuffles.");
}

};

} // namespace

Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *BoUpSLP::vectorizeTree(TreeEntry *E) {

IRBuilder<>::InsertPointGuard Guard(Builder); IRBuilder<>::InsertPointGuard Guard(Builder);

if (E->VectorizedValue) { if (E->VectorizedValue) {

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");

return E->VectorizedValue; return E->VectorizedValue;

} }

ShuffleInstructionBuilder ShuffleBuilder(Builder);

bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();

if (E->State == TreeEntry::NeedToGather) { if (E->State == TreeEntry::NeedToGather) {

setInsertPointAfterBundle(E); setInsertPointAfterBundle(E);

Value *Vec = gather(E->Scalars); Value *Vec = gather(E->Scalars);

if (NeedToShuffleReuses) { if (NeedToShuffleReuses) {

Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle"); ShuffleBuilder.addMask(E->ReuseShuffleIndices);

Vec = ShuffleBuilder.finalize(Vec);

if (auto *I = dyn_cast<Instruction>(Vec)) { if (auto *I = dyn_cast<Instruction>(Vec)) {

GatherSeq.insert(I); GatherSeq.insert(I);

CSEBlocks.insert(I->getParent()); CSEBlocks.insert(I->getParent());

} }

E->VectorizedValue = Vec; E->VectorizedValue = Vec;

return Vec; return Vec;

} }

▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines case Instruction::PHI: {

assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&

"Invalid number of incoming values"); "Invalid number of incoming values");

return V; return V;

} }

case Instruction::ExtractElement: { case Instruction::ExtractElement: {

Value *V = E->getSingleOperand(0); Value *V = E->getSingleOperand(0);

if (!E->ReorderIndices.empty()) {

SmallVector<int, 4> Mask;

inversePermutation(E->ReorderIndices, Mask);

Builder.SetInsertPoint(VL0); Builder.SetInsertPoint(VL0);

V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); ShuffleBuilder.addInversedMask(E->ReorderIndices);

} ShuffleBuilder.addMask(E->ReuseShuffleIndices);

if (NeedToShuffleReuses) { V = ShuffleBuilder.finalize(V);

// TODO: Merge this shuffle with the ReorderShuffleMask.

if (E->ReorderIndices.empty())

Builder.SetInsertPoint(VL0);

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");

}

E->VectorizedValue = V; E->VectorizedValue = V;

return V; return V;

} }

case Instruction::ExtractValue: { case Instruction::ExtractValue: {

auto *LI = cast<LoadInst>(E->getSingleOperand(0)); auto *LI = cast<LoadInst>(E->getSingleOperand(0));

Builder.SetInsertPoint(LI); Builder.SetInsertPoint(LI);

auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());

Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);

LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());

Value *NewV = propagateMetadata(V, E->Scalars); Value *NewV = propagateMetadata(V, E->Scalars);

if (!E->ReorderIndices.empty()) { ShuffleBuilder.addInversedMask(E->ReorderIndices);

SmallVector<int, 4> Mask; ShuffleBuilder.addMask(E->ReuseShuffleIndices);

inversePermutation(E->ReorderIndices, Mask); NewV = ShuffleBuilder.finalize(NewV);

NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle");

}

if (NeedToShuffleReuses) {

// TODO: Merge this shuffle with the ReorderShuffleMask.

NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices,

"shuffle");

}

E->VectorizedValue = NewV; E->VectorizedValue = NewV;

return NewV; return NewV;

} }

case Instruction::ZExt: case Instruction::ZExt:

case Instruction::SExt: case Instruction::SExt:

case Instruction::FPToUI: case Instruction::FPToUI:

case Instruction::FPToSI: case Instruction::FPToSI:

case Instruction::FPExt: case Instruction::FPExt:

Show All 10 Lines case Instruction::BitCast: {

if (E->VectorizedValue) { if (E->VectorizedValue) {

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");

return E->VectorizedValue; return E->VectorizedValue;

} }

auto *CI = cast<CastInst>(VL0); auto *CI = cast<CastInst>(VL0);

Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);

if (NeedToShuffleReuses) ShuffleBuilder.addMask(E->ReuseShuffleIndices);

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); V = ShuffleBuilder.finalize(V);

E->VectorizedValue = V; E->VectorizedValue = V;

++NumVectorInstructions; ++NumVectorInstructions;

return V; return V;

} }

case Instruction::FCmp: case Instruction::FCmp:

case Instruction::ICmp: { case Instruction::ICmp: {

setInsertPointAfterBundle(E); setInsertPointAfterBundle(E);

Value *L = vectorizeTree(E->getOperand(0)); Value *L = vectorizeTree(E->getOperand(0));

Value *R = vectorizeTree(E->getOperand(1)); Value *R = vectorizeTree(E->getOperand(1));

if (E->VectorizedValue) { if (E->VectorizedValue) {

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");

return E->VectorizedValue; return E->VectorizedValue;

} }

CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();

Value *V = Builder.CreateCmp(P0, L, R); Value *V = Builder.CreateCmp(P0, L, R);

propagateIRFlags(V, E->Scalars, VL0); propagateIRFlags(V, E->Scalars, VL0);

if (NeedToShuffleReuses) ShuffleBuilder.addMask(E->ReuseShuffleIndices);

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); V = ShuffleBuilder.finalize(V);

E->VectorizedValue = V; E->VectorizedValue = V;

++NumVectorInstructions; ++NumVectorInstructions;

return V; return V;

} }

case Instruction::Select: { case Instruction::Select: {

setInsertPointAfterBundle(E); setInsertPointAfterBundle(E);

Value *Cond = vectorizeTree(E->getOperand(0)); Value *Cond = vectorizeTree(E->getOperand(0));

Value *True = vectorizeTree(E->getOperand(1)); Value *True = vectorizeTree(E->getOperand(1));

Value *False = vectorizeTree(E->getOperand(2)); Value *False = vectorizeTree(E->getOperand(2));

if (E->VectorizedValue) { if (E->VectorizedValue) {

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");

return E->VectorizedValue; return E->VectorizedValue;

} }

Value *V = Builder.CreateSelect(Cond, True, False); Value *V = Builder.CreateSelect(Cond, True, False);

if (NeedToShuffleReuses) ShuffleBuilder.addMask(E->ReuseShuffleIndices);

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); V = ShuffleBuilder.finalize(V);

E->VectorizedValue = V; E->VectorizedValue = V;

++NumVectorInstructions; ++NumVectorInstructions;

return V; return V;

} }

case Instruction::FNeg: { case Instruction::FNeg: {

setInsertPointAfterBundle(E); setInsertPointAfterBundle(E);

Value *Op = vectorizeTree(E->getOperand(0)); Value *Op = vectorizeTree(E->getOperand(0));

if (E->VectorizedValue) { if (E->VectorizedValue) {

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");

return E->VectorizedValue; return E->VectorizedValue;

} }

Value *V = Builder.CreateUnOp( Value *V = Builder.CreateUnOp(

static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);

propagateIRFlags(V, E->Scalars, VL0); propagateIRFlags(V, E->Scalars, VL0);

if (auto *I = dyn_cast<Instruction>(V)) if (auto *I = dyn_cast<Instruction>(V))

V = propagateMetadata(I, E->Scalars); V = propagateMetadata(I, E->Scalars);

if (NeedToShuffleReuses) ShuffleBuilder.addMask(E->ReuseShuffleIndices);

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); V = ShuffleBuilder.finalize(V);

E->VectorizedValue = V; E->VectorizedValue = V;

++NumVectorInstructions; ++NumVectorInstructions;

return V; return V;

} }

case Instruction::Add: case Instruction::Add:

case Instruction::FAdd: case Instruction::FAdd:

Show All 25 Lines case Instruction::Xor: {

Value *V = Builder.CreateBinOp( Value *V = Builder.CreateBinOp(

static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,

RHS); RHS);

propagateIRFlags(V, E->Scalars, VL0); propagateIRFlags(V, E->Scalars, VL0);

if (auto *I = dyn_cast<Instruction>(V)) if (auto *I = dyn_cast<Instruction>(V))

V = propagateMetadata(I, E->Scalars); V = propagateMetadata(I, E->Scalars);

if (NeedToShuffleReuses) ShuffleBuilder.addMask(E->ReuseShuffleIndices);

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); V = ShuffleBuilder.finalize(V);

E->VectorizedValue = V; E->VectorizedValue = V;

++NumVectorInstructions; ++NumVectorInstructions;

return V; return V;

} }

case Instruction::Load: { case Instruction::Load: {

// Loads are inserted at the head of the tree because we don't want to // Loads are inserted at the head of the tree because we don't want to

Show All 25 Lines case Instruction::Load: {

Align CommonAlignment = LI->getAlign(); Align CommonAlignment = LI->getAlign();

for (Value *V : E->Scalars) for (Value *V : E->Scalars)

CommonAlignment = CommonAlignment =

commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());

NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment); NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment);

} }

Value *V = propagateMetadata(NewLI, E->Scalars); Value *V = propagateMetadata(NewLI, E->Scalars);

if (IsReorder) { ShuffleBuilder.addInversedMask(E->ReorderIndices);

SmallVector<int, 4> Mask; ShuffleBuilder.addMask(E->ReuseShuffleIndices);

inversePermutation(E->ReorderIndices, Mask); V = ShuffleBuilder.finalize(V);

V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");

}

if (NeedToShuffleReuses) {

// TODO: Merge this shuffle with the ReorderShuffleMask.

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");

}

E->VectorizedValue = V; E->VectorizedValue = V;

++NumVectorInstructions; ++NumVectorInstructions;

return V; return V;

} }

case Instruction::Store: { case Instruction::Store: {

bool IsReorder = !E->ReorderIndices.empty(); bool IsReorder = !E->ReorderIndices.empty();

auto *SI = cast<StoreInst>( auto *SI = cast<StoreInst>(

IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0); IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);

unsigned AS = SI->getPointerAddressSpace(); unsigned AS = SI->getPointerAddressSpace();

setInsertPointAfterBundle(E); setInsertPointAfterBundle(E);

Value *VecValue = vectorizeTree(E->getOperand(0)); Value *VecValue = vectorizeTree(E->getOperand(0));

if (IsReorder) { ShuffleBuilder.addMask(E->ReorderIndices);

SmallVector<int, 4> Mask(E->ReorderIndices.begin(), VecValue = ShuffleBuilder.finalize(VecValue);

E->ReorderIndices.end());

VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf");

}

Value *ScalarPtr = SI->getPointerOperand(); Value *ScalarPtr = SI->getPointerOperand();

Value *VecPtr = Builder.CreateBitCast( Value *VecPtr = Builder.CreateBitCast(

ScalarPtr, VecValue->getType()->getPointerTo(AS)); ScalarPtr, VecValue->getType()->getPointerTo(AS));

StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr, StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,

SI->getAlign()); SI->getAlign());

// The pointer operand uses an in-tree scalar, so add the new BitCast to // The pointer operand uses an in-tree scalar, so add the new BitCast to

// ExternalUses to make sure that an extract will be generated in the // ExternalUses to make sure that an extract will be generated in the

// future. // future.

if (getTreeEntry(ScalarPtr)) if (getTreeEntry(ScalarPtr))

ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0)); ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));

Value *V = propagateMetadata(ST, E->Scalars); Value *V = propagateMetadata(ST, E->Scalars);

if (NeedToShuffleReuses)

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");

anton-afanasyevUnsubmitted

Not Done

Why do we ignore ReuseShuffleIndices here?

anton-afanasyev: Why do we ignore `ReuseShuffleIndices` here?

ABataevAuthorUnsubmitted

Done

Because stores do not have uses, so this is actually just dead code.

ABataev: Because stores do not have uses, so this is actually just a dead code.

E->VectorizedValue = V; E->VectorizedValue = V;

++NumVectorInstructions; ++NumVectorInstructions;

return V; return V;

} }

case Instruction::GetElementPtr: { case Instruction::GetElementPtr: {

setInsertPointAfterBundle(E); setInsertPointAfterBundle(E);

Show All 21 Lines case Instruction::GetElementPtr: {

OpVecs.push_back(OpVec); OpVecs.push_back(OpVec);

} }

Value *V = Builder.CreateGEP( Value *V = Builder.CreateGEP(

cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);

if (Instruction *I = dyn_cast<Instruction>(V)) if (Instruction *I = dyn_cast<Instruction>(V))

V = propagateMetadata(I, E->Scalars); V = propagateMetadata(I, E->Scalars);

if (NeedToShuffleReuses) ShuffleBuilder.addMask(E->ReuseShuffleIndices);

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); V = ShuffleBuilder.finalize(V);

E->VectorizedValue = V; E->VectorizedValue = V;

++NumVectorInstructions; ++NumVectorInstructions;

return V; return V;

} }

case Instruction::Call: { case Instruction::Call: {

CallInst *CI = cast<CallInst>(VL0); CallInst *CI = cast<CallInst>(VL0);

▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines case Instruction::Call: {

// The scalar argument uses an in-tree scalar so we add the new vectorized // The scalar argument uses an in-tree scalar so we add the new vectorized

// call to ExternalUses list to make sure that an extract will be // call to ExternalUses list to make sure that an extract will be

// generated in the future. // generated in the future.

if (ScalarArg && getTreeEntry(ScalarArg)) if (ScalarArg && getTreeEntry(ScalarArg))

ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));

propagateIRFlags(V, E->Scalars, VL0); propagateIRFlags(V, E->Scalars, VL0);

if (NeedToShuffleReuses) ShuffleBuilder.addMask(E->ReuseShuffleIndices);

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); V = ShuffleBuilder.finalize(V);

E->VectorizedValue = V; E->VectorizedValue = V;

++NumVectorInstructions; ++NumVectorInstructions;

return V; return V;

} }

case Instruction::ShuffleVector: { case Instruction::ShuffleVector: {

assert(E->isAltShuffle() && assert(E->isAltShuffle() &&

((Instruction::isBinaryOp(E->getOpcode()) && ((Instruction::isBinaryOp(E->getOpcode()) &&

▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines case Instruction::ShuffleVector: {

} }

propagateIRFlags(V0, OpScalars); propagateIRFlags(V0, OpScalars);

propagateIRFlags(V1, AltScalars); propagateIRFlags(V1, AltScalars);

Value *V = Builder.CreateShuffleVector(V0, V1, Mask); Value *V = Builder.CreateShuffleVector(V0, V1, Mask);

if (Instruction *I = dyn_cast<Instruction>(V)) if (Instruction *I = dyn_cast<Instruction>(V))

V = propagateMetadata(I, E->Scalars); V = propagateMetadata(I, E->Scalars);

if (NeedToShuffleReuses) ShuffleBuilder.addMask(E->ReuseShuffleIndices);

V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); V = ShuffleBuilder.finalize(V);

E->VectorizedValue = V; E->VectorizedValue = V;

++NumVectorInstructions; ++NumVectorInstructions;

return V; return V;

} }

default: default:

llvm_unreachable("unknown inst"); llvm_unreachable("unknown inst");

▲ Show 20 Lines • Show All 3,078 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -slp-vectorizer -S -mtriple=aarch64-apple-ios -mcpu=cyclone -o - %s \| FileCheck %s			; RUN: opt -slp-vectorizer -S -mtriple=aarch64-apple-ios -mcpu=cyclone -o - %s \| FileCheck %s

	define void @f1(<2 x i16> %x, i16* %a) {			define void @f1(<2 x i16> %x, i16* %a) {
	; CHECK-LABEL: @f1(			; CHECK-LABEL: @f1(
	; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>			; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
	; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0			; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
	; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1			; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
	; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2			; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
	; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3			; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
	; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0			; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
	; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]], align 2			; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]], align 2
				RKSimonUnsubmitted Not Done Reply Inline Actions These align changes look like they should already be there - regenerate the base test file and then rebase? RKSimon: These align changes look like they should already be there - regenerate the base test file and…
	; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*			; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
	; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2			; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%t2 = extractelement <2 x i16> %x, i32 0			%t2 = extractelement <2 x i16> %x, i32 0
	%t3 = extractelement <2 x i16> %x, i32 1			%t3 = extractelement <2 x i16> %x, i32 1
	%ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0			%ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
	%ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1			%ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
	▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines

	define void @f3(<2 x i16> %x, i16* %a) {			define void @f3(<2 x i16> %x, i16* %a) {
	; CHECK-LABEL: @f3(			; CHECK-LABEL: @f3(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: br label [[CONT:%.*]]			; CHECK-NEXT: br label [[CONT:%.*]]
	; CHECK: cont:			; CHECK: cont:
	; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]			; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]
	; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]			; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]
	; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <2 x i32> <i32 1, i32 0>			; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 1>
	; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[REORDER_SHUFFLE]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
	; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0			; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
	; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1			; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
	; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2			; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
	; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3			; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
	; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0			; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
	; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2			; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2
	; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*			; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
	; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2			; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
	Show All 30 Lines

llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll

Show All 29 Lines	;
ret void		ret void
}		}

define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) {		define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) {
; CHECK-LABEL: @i64_simplifiedi_reversed(		; CHECK-LABEL: @i64_simplifiedi_reversed(
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1		; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*		; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8		; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>		; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1		; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2		; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3		; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[ST]] to <4 x i64>*		; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP4]], align 8		; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1		%arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1

%t0 = load i64, i64* %ld, align 8		%t0 = load i64, i64* %ld, align 8
%t1 = load i64, i64* %arrayidx1, align 8		%t1 = load i64, i64* %arrayidx1, align 8

%arrayidx3 = getelementptr inbounds i64, i64* %st, i64 1		%arrayidx3 = getelementptr inbounds i64, i64* %st, i64 1
▲ Show 20 Lines • Show All 42 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Merge reorder and reuse shuffles.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 309919

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll

llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Merge reorder and reuse shuffles.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 309919

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll

llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll

[SLP]Merge reorder and reuse shuffles.
ClosedPublic