diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3175,10 +3175,42 @@ const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); + SmallVector<int> ReuseShuffleIndicies; + SmallVector<Value *> UniqueValues; + auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues, + &UserTreeIdx, + this](const InstructionsState &S) { + // Check that every instruction appears once in this bundle. + DenseMap<Value *, unsigned> UniquePositions; + for (Value *V : VL) { + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1 + : Res.first->second); + if (Res.second) + UniqueValues.emplace_back(V); + } + size_t NumUniqueScalarValues = UniqueValues.size(); + if (NumUniqueScalarValues == VL.size()) { + ReuseShuffleIndicies.clear(); + } else { + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + if (NumUniqueScalarValues <= 1 || + !llvm::isPowerOf2_32(NumUniqueScalarValues)) { + LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + return false; + } + VL = UniqueValues; + } + return true; + }; + InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } @@ -3187,7 +3219,9 @@ isa<ScalableVectorType>( cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } @@ -3209,7 +3243,9 @@ 
// If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } @@ -3231,7 +3267,9 @@ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -3250,7 +3288,9 @@ if (getTreeEntry(I)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -3261,7 +3301,9 @@ for (Value *V : VL) { if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -3280,28 +3322,8 @@ } // Check that every instruction appears once in this bundle. 
- SmallVector<int> ReuseShuffleIndicies; - SmallVector<Value *> UniqueValues; - DenseMap<Value *, unsigned> UniquePositions; - for (Value *V : VL) { - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) - UniqueValues.emplace_back(V); - } - size_t NumUniqueScalarValues = UniqueValues.size(); - if (NumUniqueScalarValues == VL.size()) { - ReuseShuffleIndicies.clear(); - } else { - LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (NumUniqueScalarValues <= 1 || - !llvm::isPowerOf2_32(NumUniqueScalarValues)) { - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; - } - VL = UniqueValues; - } + if (!TryToFindDuplicates(S)) + return; auto &BSRef = BlocksSchedules[BB]; if (!BSRef) @@ -8911,6 +8933,52 @@ return OpsChanged; } +template <typename T> +static bool +tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, unsigned MinVF, + function_ref<bool(T *, T *)> Comparator, + function_ref<bool(T *, T *)> AreCompatible, + function_ref<bool(ArrayRef<T *>)> TryToVectorize) { + bool Changed = false; + // Sort by type, parent, operands. + stable_sort(Incoming, Comparator); + + // Try to vectorize elements based on their type. + SmallVector<T *> Candidates; + for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) { + // Look for the next elements with the same type, parent and operand + // kinds. + auto *SameTypeIt = IncIt; + while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt)) + ++SameTypeIt; + + // Try to vectorize them. + unsigned NumElts = (SameTypeIt - IncIt); + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes (" + << NumElts << ")\n"); + if (NumElts > 1 && TryToVectorize(makeArrayRef(IncIt, NumElts))) { + // Success start over because instructions might have been changed. 
+ Changed = true; + } else if (NumElts < MinVF && + (Candidates.empty() || + Candidates.front()->getType() == (*IncIt)->getType())) { + Candidates.append(IncIt, std::next(IncIt, NumElts)); + } + // Final attempt to vectorize instructions with the same types. + if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) { + if (Candidates.size() > 1 && TryToVectorize(Candidates)) { + // Success start over because instructions might have been changed. + Changed = true; + } + Candidates.clear(); + } + + // Start over at the next instruction of a different type (or the end). + IncIt = SameTypeIt; + } + return Changed; +} + bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; @@ -8919,11 +8987,85 @@ // node. Allows better to identify the chains that can be vectorized in the // better way. DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes; + auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) { + assert(isValidElementType(V1->getType()) && + isValidElementType(V2->getType()) && + "Expected vectorizable types only."); + // It is fine to compare type IDs here, since we expect only vectorizable + // types, like ints, floats and pointers, we don't care about other type. + if (V1->getType()->getTypeID() < V2->getType()->getTypeID()) + return true; + if (V1->getType()->getTypeID() > V2->getType()->getTypeID()) + return false; + ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; + ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; + if (Opcodes1.size() < Opcodes2.size()) + return true; + if (Opcodes1.size() > Opcodes2.size()) + return false; + for (int I = 0, E = Opcodes1.size(); I < E; ++I) { + // Undefs are compatible with any other value. 
+ if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) + continue; + if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) + if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { + DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent()); + DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent()); + if (!NodeI1) + return NodeI2 != nullptr; + if (!NodeI2) + return false; + assert((NodeI1 == NodeI2) == + (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeI1 != NodeI2) + return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); + InstructionsState S = getSameOpcode({I1, I2}); + if (S.getOpcode()) + continue; + return I1->getOpcode() < I2->getOpcode(); + } + if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) + continue; + if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) + return true; + if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) + return false; + } + return false; + }; + auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { + if (V1 == V2) + return true; + if (V1->getType() != V2->getType()) + return false; + ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; + ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; + if (Opcodes1.size() != Opcodes2.size()) + return false; + for (int I = 0, E = Opcodes1.size(); I < E; ++I) { + // Undefs are compatible with any other value. + if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) + continue; + if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) + if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { + if (I1->getParent() != I2->getParent()) + return false; + InstructionsState S = getSameOpcode({I1, I2}); + if (S.getOpcode()) + continue; + return false; + } + if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) + continue; + if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID()) + return false; + } + return true; + }; - bool HaveVectorizedPhiNodes = true; - while (HaveVectorizedPhiNodes) { - HaveVectorizedPhiNodes = false; - + bool HaveVectorizedPhiNodes = false; + do { // Collect the incoming values from the PHIs. 
Incoming.clear(); for (Instruction &I : *BB) { @@ -8961,130 +9103,14 @@ } } - // Sort by type, parent, operands. - stable_sort(Incoming, [this, &PHIToOpcodes](Value *V1, Value *V2) { - assert(isValidElementType(V1->getType()) && - isValidElementType(V2->getType()) && - "Expected vectorizable types only."); - // It is fine to compare type IDs here, since we expect only vectorizable - // types, like ints, floats and pointers, we don't care about other type. - if (V1->getType()->getTypeID() < V2->getType()->getTypeID()) - return true; - if (V1->getType()->getTypeID() > V2->getType()->getTypeID()) - return false; - ArrayRef Opcodes1 = PHIToOpcodes[V1]; - ArrayRef Opcodes2 = PHIToOpcodes[V2]; - if (Opcodes1.size() < Opcodes2.size()) - return true; - if (Opcodes1.size() > Opcodes2.size()) - return false; - for (int I = 0, E = Opcodes1.size(); I < E; ++I) { - // Undefs are compatible with any other value. - if (isa(Opcodes1[I]) || isa(Opcodes2[I])) - continue; - if (auto *I1 = dyn_cast(Opcodes1[I])) - if (auto *I2 = dyn_cast(Opcodes2[I])) { - DomTreeNodeBase *NodeI1 = DT->getNode(I1->getParent()); - DomTreeNodeBase *NodeI2 = DT->getNode(I2->getParent()); - if (!NodeI1) - return NodeI2 != nullptr; - if (!NodeI2) - return false; - assert((NodeI1 == NodeI2) == - (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && - "Different nodes should have different DFS numbers"); - if (NodeI1 != NodeI2) - return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); - InstructionsState S = getSameOpcode({I1, I2}); - if (S.getOpcode()) - continue; - return I1->getOpcode() < I2->getOpcode(); - } - if (isa(Opcodes1[I]) && isa(Opcodes2[I])) - continue; - if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) - return true; - if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) - return false; - } - return false; - }); - - auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { - if (V1 == V2) - return true; - if (V1->getType() != V2->getType()) - return false; - ArrayRef 
Opcodes1 = PHIToOpcodes[V1]; - ArrayRef Opcodes2 = PHIToOpcodes[V2]; - if (Opcodes1.size() != Opcodes2.size()) - return false; - for (int I = 0, E = Opcodes1.size(); I < E; ++I) { - // Undefs are compatible with any other value. - if (isa(Opcodes1[I]) || isa(Opcodes2[I])) - continue; - if (auto *I1 = dyn_cast(Opcodes1[I])) - if (auto *I2 = dyn_cast(Opcodes2[I])) { - if (I1->getParent() != I2->getParent()) - return false; - InstructionsState S = getSameOpcode({I1, I2}); - if (S.getOpcode()) - continue; - return false; - } - if (isa(Opcodes1[I]) && isa(Opcodes2[I])) - continue; - if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID()) - return false; - } - return true; - }; - - // Try to vectorize elements base on their type. - SmallVector Candidates; - for (SmallVector::iterator IncIt = Incoming.begin(), - E = Incoming.end(); - IncIt != E;) { - - // Look for the next elements with the same type, parent and operand - // kinds. - SmallVector::iterator SameTypeIt = IncIt; - while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) { - VisitedInstrs.insert(*SameTypeIt); - ++SameTypeIt; - } - - // Try to vectorize them. - unsigned NumElts = (SameTypeIt - IncIt); - LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs (" - << NumElts << ")\n"); - // The order in which the phi nodes appear in the program does not matter. - // So allow tryToVectorizeList to reorder them if it is beneficial. This - // is done when there are exactly two elements since tryToVectorizeList - // asserts that there are only two values when AllowReorder is true. - if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) { - // Success start over because instructions might have been changed. 
- HaveVectorizedPhiNodes = true; - Changed = true; - } else if (NumElts < 4 && - (Candidates.empty() || - Candidates.front()->getType() == (*IncIt)->getType())) { - Candidates.append(IncIt, std::next(IncIt, NumElts)); - } - // Final attempt to vectorize phis with the same types. - if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) { - if (Candidates.size() > 1 && tryToVectorizeList(Candidates, R)) { - // Success start over because instructions might have been changed. - HaveVectorizedPhiNodes = true; - Changed = true; - } - Candidates.clear(); - } - - // Start over at the next instruction of a different type (or the end). - IncIt = SameTypeIt; - } - } + HaveVectorizedPhiNodes = tryToVectorizeSequence( + Incoming, 4, PHICompare, AreCompatiblePHIs, + [this, &R](ArrayRef Candidates) { + return tryToVectorizeList(Candidates, R); + }); + Changed |= HaveVectorizedPhiNodes; + VisitedInstrs.insert(Incoming.begin(), Incoming.end()); + } while (HaveVectorizedPhiNodes); VisitedInstrs.clear(); @@ -9346,33 +9372,16 @@ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Pair.second.size() << ".\n"); - stable_sort(Pair.second, StoreSorter); - - // Try to vectorize elements based on their compatibility. - for (ArrayRef::iterator IncIt = Pair.second.begin(), - E = Pair.second.end(); - IncIt != E;) { - - // Look for the next elements with the same type. - ArrayRef::iterator SameTypeIt = IncIt; - Type *EltTy = (*IncIt)->getPointerOperand()->getType(); - - while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt)) - ++SameTypeIt; - - // Try to vectorize them. - unsigned NumElts = (SameTypeIt - IncIt); - LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores (" - << NumElts << ")\n"); - if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() && - vectorizeStores(makeArrayRef(IncIt, NumElts), R)) { - // Success start over because instructions might have been changed. 
- Changed = true; - } + if (!isValidElementType(Pair.second.front()->getValueOperand()->getType())) + continue; - // Start over at the next instruction of a different type (or the end). - IncIt = SameTypeIt; - } + unsigned EltSize = R.getVectorElementSize(Pair.second.front()); + unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize); + Changed |= tryToVectorizeSequence( + Pair.second, MinVF, StoreSorter, AreCompatibleStores, + [this, &R](ArrayRef Candidates) { + return vectorizeStores(Candidates, R); + }); } return Changed; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -16,38 +16,13 @@ define void @splat(i8 %a, i8 %b, i8 %c) { ; SSE-LABEL: @splat( -; SSE-NEXT: [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]] -; SSE-NEXT: store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16 -; SSE-NEXT: [[TMP2:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1), align 1 -; SSE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2), align 1 -; SSE-NEXT: [[TMP4:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3), align 1 -; SSE-NEXT: [[TMP5:%.*]] = xor i8 [[C]], [[A]] -; SSE-NEXT: store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4), align 1 -; SSE-NEXT: [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]] -; SSE-NEXT: store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5), align 1 -; SSE-NEXT: [[TMP7:%.*]] = xor i8 [[C]], [[A]] -; SSE-NEXT: store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6), align 1 
-; SSE-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]] -; SSE-NEXT: store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7), align 1 -; SSE-NEXT: [[TMP9:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8), align 1 -; SSE-NEXT: [[TMP10:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9), align 1 -; SSE-NEXT: [[TMP11:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10), align 1 -; SSE-NEXT: [[TMP12:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11), align 1 -; SSE-NEXT: [[TMP13:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12), align 1 -; SSE-NEXT: [[TMP14:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13), align 1 -; SSE-NEXT: [[TMP15:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14), align 1 -; SSE-NEXT: [[TMP16:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15), align 1 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 +; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer +; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[B:%.*]], i32 1 +; SSE-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> +; SSE-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]] +; SSE-NEXT: store <16 x i8> 
[[TMP4]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @splat( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -313,21 +313,18 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[V1:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 undef to i16 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i64 0 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[PTR0]], align 16 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 1 -; CHECK-NEXT: store i16 [[TMP2]], i16* [[PTR1]], align 4 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 2 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[PTR2]], align 8 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 3 -; CHECK-NEXT: store i16 [[TMP2]], i16* [[PTR3]], align 4 ; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 4 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[PTR4]], align 16 ; CHECK-NEXT: [[PTR5:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 5 -; CHECK-NEXT: store i16 [[TMP2]], i16* [[PTR5]], align 4 ; CHECK-NEXT: [[PTR6:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 6 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[PTR6]], align 8 ; CHECK-NEXT: [[PTR7:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 7 -; CHECK-NEXT: store i16 [[TMP2]], i16* [[PTR7]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[TMP2]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[PTR0]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; %1 = load i16, 
i16* %v1, align 4