Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -541,6 +541,7 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -600,6 +601,9 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable();

+  /// Reduce the cost of the tree to make it partially vectorizable, if possible.
+  bool reduceTreeCost(int Delta);
+
   OptimizationRemarkEmitter *getORE() { return ORE; }

 private:
@@ -700,6 +704,9 @@
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<int, 1> UserTreeIndices;
+
+    /// Cost of the tree entry.
+    int Cost;
   };

   /// Create a new VectorizableTree entry.
@@ -742,6 +749,9 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, int> ScalarToTreeEntry;

+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector<int, 4> RemovedOperations;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;

@@ -1170,6 +1180,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

+  /// Remove operations from the list of operations proposed to be scheduled.
+  void removeFromScheduling(BlockScheduling *BS, ArrayRef<int> Operations);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -2413,6 +2426,70 @@
   }
 }

+bool BoUpSLP::reduceTreeCost(int Delta) {
+  SmallVector<int, 4> Tree;
+  bool Reduced = false;
+  int CostSum = 0;
+
+  // Walk the tree backwards, from the last entry towards the top. In that
+  // order the gathering nodes, if any, come right before the vectorizable
+  // operation that uses them, so the real cost of every operation can be
+  // found by summing the cost of its gathering node(s) with the cost of
+  // the operation node that follows them.
+  for (unsigned I = VectorizableTree.size(); I--;) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (!Entry->NeedToGather) {
+      CostSum += Entry->Cost;
+      Tree.push_back(CostSum);
+      CostSum = 0;
+    } else {
+      CostSum += Entry->Cost;
+    }
+  }
+
+  // Estimate where to stop vectorizing.
+  CostSum = 0;
+  unsigned StopAt = 0;
+  for (unsigned I = 0, E = Tree.size(); I < E; I++) {
+    CostSum += Tree[I];
+    if (CostSum >= Delta) {
+      StopAt = I;
+      break;
+    }
+  }
+
+  // Cancel the unprofitable elements.
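+  // Everything from StopAt onwards is switched back to gathering: the entry's
+  // scalars are returned to MustGather, their external uses are dropped, and
+  // the entry index is recorded in RemovedOperations so that the bundle can
+  // later be removed from scheduling as well.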
+  if (StopAt > 0 && StopAt < (Tree.size() - 1)) {
+    LLVM_DEBUG(dbgs() << "SLP: Reduced the tree cost by " << Delta
+                      << " to make it partially vectorizable.\n");
+    Reduced = true;
+    for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) {
+      TreeEntry *Entry = &VectorizableTree[I];
+      if (!Entry->NeedToGather) {
+        if (I >= StopAt) {
+          Entry->NeedToGather = true;
+          RemovedOperations.push_back(I);
+          for (Value *V : Entry->Scalars) {
+            LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                              << " from the proposed vectorization.\n");
+            ScalarToTreeEntry.erase(V);
+            MustGather.insert(V);
+            ExternalUses.erase(std::remove_if(ExternalUses.begin(),
+                                              ExternalUses.end(),
+                                              [&](ExternalUser &EU) {
+                                                return EU.Scalar == V;
+                                              }),
+                               ExternalUses.end());
+          }
+        }
+      }
+    }
+  }
+
+  return Reduced;
+}
+
 bool BoUpSLP::isFullyVectorizableTinyTree() {
   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                     << VectorizableTree.size() << " is fully vectorizable .\n");
@@ -2468,7 +2545,8 @@
   SmallPtrSet<Instruction *, 4> LiveValues;
   Instruction *PrevInst = nullptr;

-  for (const auto &N : VectorizableTree) {
+  for (auto &N : VectorizableTree) {
+    int EntryCost = 0;
     Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
     if (!Inst)
       continue;
@@ -2508,15 +2586,19 @@
            !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
           &*PrevInstIt != PrevInst) {
         SmallVector<Type *, 4> V;
+        int ElementCost;
         for (auto *II : LiveValues)
           V.push_back(VectorType::get(II->getType(), BundleWidth));
-        Cost += TTI->getCostOfKeepingLiveOverCall(V);
+        ElementCost = TTI->getCostOfKeepingLiveOverCall(V);
+        Cost += ElementCost;
+        EntryCost += ElementCost;
       }
       ++PrevInstIt;
     }

     PrevInst = Inst;
+    N.Cost += EntryCost;
   }

   return Cost;
@@ -2552,6 +2634,7 @@
       continue;

     int C = getEntryCost(&TE);
+    TE.Cost = C;
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                       << " for bundle that starts with " << *TE.Scalars[0]
                       << ".\n");
@@ -3574,7 +3657,12 @@
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove the ScheduleData of all nodes for which we have changed the
+    // vectorization decision.
+    if (!RemovedOperations.empty())
+      removeFromScheduling(BS, RemovedOperations);
+    scheduleBlock(BS);
   }

   Builder.SetInsertPoint(&F->getEntryBlock().front());
@@ -3702,15 +3790,8 @@
     Type *Ty = Scalar->getType();
     if (!Ty->isVoidTy()) {
-#ifndef NDEBUG
-      for (User *U : Scalar->users()) {
-        LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
-
-        // It is legal to replace users in the ignorelist by undef.
-        assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
-               "Replacing out-of-tree value with undef");
-      }
-#endif
+      // The tree might not have been fully vectorized, so scalars may still
+      // have legitimate out-of-tree users; do not validate every user here.
       Value *Undef = UndefValue::get(Ty);
       Scalar->replaceAllUsesWith(Undef);
     }
@@ -4186,6 +4267,33 @@
   ReadyInsts.clear();
 }

+void BoUpSLP::removeFromScheduling(BlockScheduling *BS, ArrayRef<int> Operations) {
+  bool Removed = false;
+  for (int I : Operations) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]);
+    if (SD && SD->isPartOfBundle()) {
+      if (!Removed) {
+        Removed = true;
+        BS->resetSchedule();
+      }
+      BS->cancelScheduling(Entry->Scalars, SD->OpValue);
+    }
+  }
+  if (Removed) {
+    BS->resetSchedule();
+    BS->initialFillReadyList(BS->ReadyInsts);
+    for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+         I = I->getNextNode()) {
+      if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end())
+        continue;
+      BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+        SD->clearDependencies();
+      });
+    }
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -4682,7 +4790,16 @@
   const unsigned ChainLen = Chain.size();
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                     << "\n");
-  const unsigned Sz = R.getVectorElementSize(Chain[0]);
+  Value *FirstStore = nullptr;
+  for (Value *V : Chain) {
+    assert(isa<StoreInst>(V) && "Expected only StoreInst here!");
+    if (StoreInst *SI = cast<StoreInst>(V))
+      if (SI->getValueOperand())
+        FirstStore = V;
+  }
+  if (!FirstStore)
+    return false;
+  const unsigned Sz = R.getVectorElementSize(FirstStore);
   const unsigned VF = VecRegSize / Sz;

   if (!isPowerOf2_32(Sz) || VF < 2)
@@ -4696,13 +4813,17 @@
   for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
     // Check that a previous iteration of this loop did not delete the Value.
-    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
-      continue;
-
     LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
                       << "\n");
     ArrayRef<Value *> Operands = Chain.slice(i, VF);
+    // Skip the bundle if any of its stores has already been vectorized.
+    if (std::any_of(Operands.begin(), Operands.end(),
+                    [](Value *V) {
+                      return !cast<StoreInst>(V)->getValueOperand();
+                    }))
+      continue;
+
     R.buildTree(Operands);
     if (R.isTreeTinyAndNotFullyVectorizable())
       continue;
@@ -4729,6 +4850,16 @@
       // Move to the next bundle.
       i += VF - 1;
       Changed = true;
+    } else {
+      // Try to reduce the tree cost to make it partially vectorizable.
+      int Delta = Cost - SLPCostThreshold + 1;
+      if (R.getTreeSize() > 1) {
+        Changed = R.reduceTreeCost(Delta);
+        if (Changed) {
+          R.vectorizeTree();
+          i += VF - 1;
+        }
+      }
     }
   }

@@ -4802,7 +4933,6 @@
        Size /= 2) {
     if (vectorizeStoreChain(Operands, R, Size)) {
       // Mark the vectorized stores so that we don't vectorize them again.
-      VectorizedStores.insert(Operands.begin(), Operands.end());
       Changed = true;
       break;
     }
Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -39,7 +39,6 @@
 ; CHECK-LABEL: @store_chain_v2i64(
 ; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
 ; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
 ; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* [[A]], align 8
 ; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
 ; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* [[B]], align 8
@@ -50,8 +49,10 @@
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    store i64 [[TMP2_0]], i64* [[C]], align 8
-; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2_1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.0 = getelementptr i64, i64* %a, i64 0
Index: test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
===================================================================
--- test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
+++ test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
@@ -33,16 +33,17 @@
 ; NOVECTOR-NEXT:    [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]]
 ; NOVECTOR-NEXT:    [[TMP12:%.*]] = load half, half* [[TMP11]], align 8
 ; NOVECTOR-NEXT:    [[TMP13:%.*]] = fmul fast half [[TMP12]], 0xH5380
-; NOVECTOR-NEXT:    [[TMP14:%.*]] = fadd fast half [[TMP13]], 0xH57F0
 ; NOVECTOR-NEXT:    [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half*
 ; NOVECTOR-NEXT:    [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]]
-; NOVECTOR-NEXT:    store half [[TMP14]], half* [[TMP16]], align 8
 ; NOVECTOR-NEXT:    [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
 ; NOVECTOR-NEXT:    [[TMP18:%.*]] = load half, half* [[TMP17]], align 2
 ; NOVECTOR-NEXT:    [[TMP19:%.*]] = fmul fast half [[TMP18]], 0xH5380
-; NOVECTOR-NEXT:    [[TMP20:%.*]] = fadd fast half [[TMP19]], 0xH57F0
+; NOVECTOR-NEXT:    [[TMP1:%.*]] = insertelement <2 x half> undef, half [[TMP13]], i32 0
+; NOVECTOR-NEXT:    [[TMP2:%.*]] = insertelement <2 x half> [[TMP1]], half [[TMP19]], i32 1
+; NOVECTOR-NEXT:    [[TMP3:%.*]] = fadd fast <2 x half> <half 0xH57F0, half 0xH57F0>, [[TMP2]]
 ; NOVECTOR-NEXT:    [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
-; NOVECTOR-NEXT:    store half [[TMP20]], half* [[TMP21]], align 2
+; NOVECTOR-NEXT:    [[TMP4:%.*]] = bitcast half* [[TMP16]] to <2 x half>*
+; NOVECTOR-NEXT:    store <2 x half> [[TMP3]], <2 x half>* [[TMP4]], align 8
 ; NOVECTOR-NEXT:    ret void
 ;
   %tmp = shl nuw nsw i32 %arg2, 6
Index: test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
+++ test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
@@ -27,92 +27,148 @@
 define void @add_v8i64() {
 ; SSE-LABEL: @add_v8i64(
-; SSE-NEXT:    [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
-; SSE-NEXT:
[[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x 
i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: 
[[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SSE-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds 
([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SLM-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SLM-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SLM-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SLM-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SLM-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SLM-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SLM-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SLM-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SLM-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SLM-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SLM-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SLM-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SLM-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SLM-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SLM-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SLM-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SLM-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SLM-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SLM-NEXT: [[TMP31:%.*]] = extractelement 
<2 x i64> [[TMP7]], i32 0 +; SLM-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SLM-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SLM-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SLM-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SLM-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SLM-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SLM-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SLM-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SLM-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SLM-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SLM-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SLM-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SLM-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SLM-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX1-LABEL: @add_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; 
AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP12]], <4 x i64> [[TMP20]]) +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP29]], <4 x i64> [[TMP37]]) +; AVX1-NEXT: 
store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @add_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-add-usat.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-add-usat.ll +++ test/Transforms/SLPVectorizer/X86/arith-add-usat.ll @@ -27,38 +27,54 @@ define void @add_v8i64() { ; SSE-LABEL: @add_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 
[[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; 
SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SSE-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-mul.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -22,108 +22,148 @@ define void @mul_v8i64() { ; SSE-LABEL: @mul_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), 
align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = mul <2 x i64> [[TMP12]], [[TMP16]] +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; 
SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = mul <2 x i64> [[TMP21]], [[TMP25]] +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = mul <2 x i64> [[TMP30]], [[TMP34]] +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = mul <2 x i64> [[TMP39]], [[TMP43]] +; SSE-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr 
inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] -; SLM-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] -; SLM-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] -; SLM-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] -; SLM-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] -; SLM-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] -; SLM-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] -; SLM-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SLM-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SLM-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SLM-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SLM-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SLM-NEXT: 
[[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SLM-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SLM-NEXT: [[TMP17:%.*]] = mul <2 x i64> [[TMP12]], [[TMP16]] +; SLM-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SLM-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SLM-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SLM-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SLM-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SLM-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SLM-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SLM-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SLM-NEXT: [[TMP26:%.*]] = mul <2 x i64> [[TMP21]], [[TMP25]] +; SLM-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SLM-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SLM-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SLM-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SLM-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SLM-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SLM-NEXT: [[TMP35:%.*]] = mul <2 x i64> [[TMP30]], [[TMP34]] +; SLM-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SLM-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SLM-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SLM-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SLM-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SLM-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SLM-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SLM-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SLM-NEXT: [[TMP44:%.*]] = mul <2 x i64> [[TMP39]], [[TMP43]] +; SLM-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX1-LABEL: @mul_v8i64( -; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; 
AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] -; AVX1-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] -; AVX1-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] -; AVX1-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] -; AVX1-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] -; AVX1-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] -; AVX1-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] -; AVX1-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] -; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = 
insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = mul <4 x i64> [[TMP12]], [[TMP20]] +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = mul <4 x i64> [[TMP29]], [[TMP37]] +; AVX1-NEXT: store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @mul_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll +++ test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll @@ -27,92 +27,148 @@ define void @sub_v8i64() { ; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds 
([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* 
bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SSE-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: 
ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 
x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SLM-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SLM-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SLM-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SLM-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SLM-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SLM-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SLM-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SLM-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SLM-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SLM-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SLM-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SLM-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SLM-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SLM-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SLM-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SLM-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SLM-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SLM-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SLM-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SLM-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SLM-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SLM-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SLM-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SLM-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SLM-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SLM-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SLM-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SLM-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SLM-NEXT: 
[[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SLM-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SLM-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SLM-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SLM-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX1-LABEL: @sub_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP12]], <4 x i64> [[TMP20]]) +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP29]], <4 x i64> [[TMP37]]) +; AVX1-NEXT: store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @sub_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll +++ test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll @@ -27,38 +27,54 @@ define void @sub_v8i64() { ; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = 
load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast 
(i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[TMP16]]) +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP21]], <2 x i64> [[TMP25]]) +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP30]], <2 x i64> [[TMP34]]) +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP43:%.*]] = 
insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP39]], <2 x i64> [[TMP43]]) +; SSE-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( Index: test/Transforms/SLPVectorizer/X86/bswap.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/bswap.ll +++ test/Transforms/SLPVectorizer/X86/bswap.ll @@ -22,8 +22,9 @@ ; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; SSE-NEXT: [[BSWAP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD0]]) ; SSE-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]]) -; SSE-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[BSWAP0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[BSWAP1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @bswap_2i64( @@ -51,10 +52,12 @@ ; SSE-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]]) ; SSE-NEXT: [[BSWAP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD2]]) ; SSE-NEXT: [[BSWAP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD3]]) -; SSE-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; SSE-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; SSE-NEXT: store i64 [[BSWAP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; SSE-NEXT: store i64 [[BSWAP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[BSWAP0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[BSWAP1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[BSWAP2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[BSWAP3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @bswap_4i64( Index: test/Transforms/SLPVectorizer/X86/cast.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cast.ll +++ test/Transforms/SLPVectorizer/X86/cast.ll @@ -81,15 +81,20 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] 
to <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP3]] to i64 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 3 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[CONV6]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP9]], i64 [[CONV9]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[A:%.*]] to <4 x i64>* +; CHECK-NEXT: store <4 x i64> [[TMP10]], <4 x i64>* [[TMP11]], align 4 ; CHECK-NEXT: ret i64 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/ctlz.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/ctlz.ll +++ test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -30,8 +30,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false) ; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -44,20 +45,38 @@ } define void @ctlz_4i64() #0 { -; CHECK-LABEL: @ctlz_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false) -; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: [[CTLZ2:%.*]] = call i64 
@llvm.ctlz.i64(i64 [[LD2]], i1 false) -; CHECK-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @ctlz_4i64( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; SSE-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false) +; SSE-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false) +; SSE-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false) +; SSE-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false) +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CTLZ3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ctlz_4i64( +; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; AVX-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; AVX-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; AVX-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false) +; AVX-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false) +; AVX-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false) +; AVX-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false) +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CTLZ0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CTLZ2]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CTLZ3]], i32 3 +; AVX-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -84,10 +103,11 @@ ; CHECK-NEXT: [[CTLZ1:%.*]] = call i32 
@llvm.ctlz.i32(i32 [[LD1]], i1 false) ; CHECK-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) ; CHECK-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; CHECK-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; CHECK-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -123,14 +143,16 @@ ; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) ; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) ; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) -; SSE-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CTLZ5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTLZ6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTLZ7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ctlz_8i32( @@ -150,14 +172,15 @@ ; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) ; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) ; AVX1-NEXT: 
[[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) -; AVX1-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CTLZ0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CTLZ4]], i32 4 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CTLZ5]], i32 5 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CTLZ6]], i32 6 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CTLZ7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctlz_8i32( @@ -471,8 +494,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) ; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -485,20 +509,38 @@ } define void @ctlz_undef_4i64() #0 { -; CHECK-LABEL: @ctlz_undef_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) -; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true) -; 
CHECK-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @ctlz_undef_4i64( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; SSE-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) +; SSE-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true) +; SSE-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true) +; SSE-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true) +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CTLZ2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CTLZ3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ctlz_undef_4i64( +; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; AVX-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; AVX-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; AVX-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) +; AVX-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true) +; AVX-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true) +; AVX-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true) +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CTLZ0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CTLZ1]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CTLZ2]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CTLZ3]], i32 3 +; AVX-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -525,10 +567,11 @@ ; CHECK-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true) ; 
CHECK-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true) ; CHECK-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true) -; CHECK-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; CHECK-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -564,14 +607,16 @@ ; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true) ; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true) ; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true) -; SSE-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CTLZ4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CTLZ5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTLZ6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTLZ7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ctlz_undef_8i32( @@ -591,14 +636,15 @@ ; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true) ; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true) ; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 
@llvm.ctlz.i32(i32 [[LD7]], i1 true) -; AVX1-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CTLZ0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CTLZ1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CTLZ2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CTLZ3]], i32 3 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CTLZ4]], i32 4 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CTLZ5]], i32 5 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CTLZ6]], i32 6 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CTLZ7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctlz_undef_8i32( Index: test/Transforms/SLPVectorizer/X86/ctpop.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/ctpop.ll +++ test/Transforms/SLPVectorizer/X86/ctpop.ll @@ -26,8 +26,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; CHECK-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]]) ; CHECK-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) -; CHECK-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTPOP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTPOP1]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -49,10 +50,12 @@ ; SSE-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) ; SSE-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]]) ; SSE-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]]) -; SSE-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; SSE-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; SSE-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; SSE-NEXT: store i64 [[CTPOP3]], i64* getelementptr 
inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTPOP0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTPOP1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CTPOP2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CTPOP3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ctpop_4i64( @@ -64,10 +67,11 @@ ; AVX1-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) ; AVX1-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]]) ; AVX1-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]]) -; AVX1-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; AVX1-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; AVX1-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; AVX1-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CTPOP0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CTPOP1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CTPOP2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CTPOP3]], i32 3 +; AVX1-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctpop_4i64( @@ -107,10 +111,11 @@ ; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) ; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) ; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; SSE42-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTPOP0]], i32 0 +; SSE42-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTPOP1]], i32 1 +; SSE42-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTPOP2]], i32 2 +; SSE42-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTPOP3]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; SSE42-NEXT: ret void ; ; AVX-LABEL: @ctpop_4i32( @@ -122,10 +127,11 @@ ; AVX-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) ; AVX-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) ; AVX-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; AVX-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; 
AVX-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTPOP0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTPOP1]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTPOP2]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTPOP3]], i32 3 +; AVX-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -170,14 +176,16 @@ ; SSE42-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]]) ; SSE42-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]]) ; SSE42-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]]) -; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE42-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE42-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE42-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE42-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE42-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTPOP0]], i32 0 +; SSE42-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTPOP1]], i32 1 +; SSE42-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTPOP2]], i32 2 +; SSE42-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTPOP3]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CTPOP4]], i32 0 +; SSE42-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CTPOP5]], i32 1 +; SSE42-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTPOP6]], i32 2 +; SSE42-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTPOP7]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; ; AVX1-LABEL: @ctpop_8i32( @@ -197,14 +205,15 @@ ; AVX1-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]]) ; AVX1-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]]) ; AVX1-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]]) -; AVX1-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; 
AVX1-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CTPOP0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CTPOP1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CTPOP2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CTPOP3]], i32 3 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CTPOP4]], i32 4 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CTPOP5]], i32 5 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CTPOP6]], i32 6 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CTPOP7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctpop_8i32( Index: test/Transforms/SLPVectorizer/X86/cttz.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cttz.ll +++ test/Transforms/SLPVectorizer/X86/cttz.ll @@ -30,8 +30,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false) ; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -44,20 +45,38 @@ } define void @cttz_4i64() #0 { -; CHECK-LABEL: @cttz_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false) -; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false) -; CHECK-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; 
CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @cttz_4i64( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; SSE-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false) +; SSE-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false) +; SSE-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false) +; SSE-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false) +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CTTZ3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @cttz_4i64( +; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; AVX-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; AVX-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; AVX-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false) +; AVX-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false) +; AVX-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false) +; AVX-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false) +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CTTZ0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CTTZ2]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CTTZ3]], i32 3 +; AVX-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -84,10 +103,11 @@ ; CHECK-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) ; CHECK-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) ; CHECK-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) -; CHECK-NEXT: store i32 [[CTTZ0]], i32* getelementptr 
inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; CHECK-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -123,14 +143,16 @@ ; SSE-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false) ; SSE-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false) ; SSE-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false) -; SSE-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CTTZ5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTTZ6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTTZ7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @cttz_8i32( @@ -150,14 +172,15 @@ ; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false) ; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false) ; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false) -; AVX1-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 
x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CTTZ0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CTTZ4]], i32 4 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CTTZ5]], i32 5 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CTTZ6]], i32 6 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CTTZ7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @cttz_8i32( @@ -471,8 +494,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 ; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) ; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -485,20 +509,38 @@ } define void @cttz_undef_4i64() #0 { -; CHECK-LABEL: @cttz_undef_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) -; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true) -; CHECK-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr 
inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @cttz_undef_4i64( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; SSE-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) +; SSE-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true) +; SSE-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true) +; SSE-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true) +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CTTZ2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CTTZ3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @cttz_undef_4i64( +; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; AVX-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; AVX-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; AVX-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) +; AVX-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true) +; AVX-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true) +; AVX-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true) +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CTTZ0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CTTZ1]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CTTZ2]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CTTZ3]], i32 3 +; AVX-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -525,10 +567,11 @@ ; CHECK-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true) ; CHECK-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true) ; CHECK-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true) -; CHECK-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 
0), align 4 -; CHECK-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -564,14 +607,16 @@ ; SSE-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 true) ; SSE-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 true) ; SSE-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 true) -; SSE-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CTTZ4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CTTZ5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTTZ6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTTZ7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @cttz_undef_8i32( @@ -591,14 +636,15 @@ ; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 true) ; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 true) ; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 true) -; AVX1-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 
-; AVX1-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CTTZ0]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CTTZ1]], i32 1 +; AVX1-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CTTZ2]], i32 2 +; AVX1-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CTTZ3]], i32 3 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CTTZ4]], i32 4 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CTTZ5]], i32 5 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CTTZ6]], i32 6 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CTTZ7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @cttz_undef_8i32( Index: test/Transforms/SLPVectorizer/X86/fma.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fma.ll +++ test/Transforms/SLPVectorizer/X86/fma.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256 @@ -26,16 +25,19 @@ define void @fma_2f64() #0 { ; NO-FMA-LABEL: @fma_2f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; 
NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP5]], double [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP9:%.*]] = insertelement <2 x double> undef, double [[TMP8]], i32 0 +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP10]], i32 1 +; NO-FMA-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP7]], <2 x double> [[TMP11]]) +; NO-FMA-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_2f64( @@ -61,26 +63,27 @@ define void @fma_4f64() #0 { ; NO-FMA-LABEL: @fma_4f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]]) -; 
NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 8 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 8 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 8 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP8]], i32 2 +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i32 3 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <4 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP13:%.*]] = insertelement <4 x double> undef, double [[TMP12]], i32 0 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <4 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i32 1 +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <4 x double> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP15]], double [[TMP16]], i32 2 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <4 x double> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i32 3 +; NO-FMA-NEXT: [[TMP20:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP11]], <4 x double> [[TMP19]]) +; NO-FMA-NEXT: store <4 x double> [[TMP20]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_4f64( @@ -116,46 +119,48 @@ define void @fma_8f64() #0 { ; NO-FMA-LABEL: @fma_8f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4 -; 
NO-FMA-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call double @llvm.fma.f64(double [[A4]], double [[B4]], double [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call double @llvm.fma.f64(double [[A5]], double [[B5]], double [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call double @llvm.fma.f64(double [[A6]], double [[B6]], double [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call double @llvm.fma.f64(double [[A7]], double [[B7]], double [[C7]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 
x double], [8 x double]* @dst64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store double [[FMA4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store double [[FMA5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store double [[FMA6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store double [[FMA7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP8:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 0 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP10:%.*]] = insertelement <4 x double> [[TMP8]], double [[TMP9]], i32 1 +; NO-FMA-NEXT: [[TMP11:%.*]] = extractelement <4 x double> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP11]], i32 2 +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <4 x double> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP14:%.*]] = insertelement <4 x double> [[TMP12]], double [[TMP13]], i32 3 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <4 x double> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[TMP16:%.*]] = insertelement <4 x double> undef, double [[TMP15]], i32 0 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <4 x double> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[TMP18:%.*]] = insertelement <4 x double> [[TMP16]], double [[TMP17]], i32 1 +; NO-FMA-NEXT: [[TMP19:%.*]] = extractelement <4 x double> [[TMP5]], i32 2 +; NO-FMA-NEXT: [[TMP20:%.*]] = insertelement <4 x double> [[TMP18]], double [[TMP19]], i32 2 +; NO-FMA-NEXT: [[TMP21:%.*]] = extractelement <4 x double> [[TMP5]], i32 3 +; NO-FMA-NEXT: [[TMP22:%.*]] = insertelement <4 x double> [[TMP20]], double [[TMP21]], i32 3 +; NO-FMA-NEXT: [[TMP23:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP14]], <4 x double> [[TMP22]]) +; NO-FMA-NEXT: [[TMP24:%.*]] = extractelement <4 x double> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP25:%.*]] = insertelement <4 x double> undef, double [[TMP24]], i32 0 +; NO-FMA-NEXT: [[TMP26:%.*]] = extractelement <4 x double> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP27:%.*]] = insertelement <4 x double> [[TMP25]], double [[TMP26]], i32 1 +; NO-FMA-NEXT: [[TMP28:%.*]] = extractelement <4 x double> [[TMP4]], i32 2 +; NO-FMA-NEXT: [[TMP29:%.*]] = insertelement <4 x double> [[TMP27]], double [[TMP28]], i32 2 +; NO-FMA-NEXT: [[TMP30:%.*]] = extractelement <4 x double> [[TMP4]], i32 
3 +; NO-FMA-NEXT: [[TMP31:%.*]] = insertelement <4 x double> [[TMP29]], double [[TMP30]], i32 3 +; NO-FMA-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[TMP33:%.*]] = insertelement <4 x double> undef, double [[TMP32]], i32 0 +; NO-FMA-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP35:%.*]] = insertelement <4 x double> [[TMP33]], double [[TMP34]], i32 1 +; NO-FMA-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP6]], i32 2 +; NO-FMA-NEXT: [[TMP37:%.*]] = insertelement <4 x double> [[TMP35]], double [[TMP36]], i32 2 +; NO-FMA-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP6]], i32 3 +; NO-FMA-NEXT: [[TMP39:%.*]] = insertelement <4 x double> [[TMP37]], double [[TMP38]], i32 3 +; NO-FMA-NEXT: [[TMP40:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP31]], <4 x double> [[TMP39]]) +; NO-FMA-NEXT: store <4 x double> [[TMP23]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; NO-FMA-NEXT: store <4 x double> [[TMP40]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA256-LABEL: @fma_8f64( @@ -224,26 +229,27 @@ define void @fma_4f32() #0 { ; NO-FMA-LABEL: @fma_4f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 
1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP8]], i32 2 +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP10]], i32 3 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[TMP12]], i32 0 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP14]], i32 1 +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP16]], i32 2 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i32 3 +; NO-FMA-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP11]], <4 x float> [[TMP19]]) +; NO-FMA-NEXT: store <4 x float> [[TMP20]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_4f32( @@ -279,46 +285,43 @@ define void @fma_8f32() #0 { ; NO-FMA-LABEL: @fma_8f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* 
getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds 
([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = insertelement <8 x float> undef, float [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i32 2 +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 3 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <8 x float> [[TMP2]], i32 4 +; NO-FMA-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 4 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 +; NO-FMA-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i32 5 +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 +; NO-FMA-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP16]], i32 6 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <8 x float> [[TMP2]], i32 7 +; NO-FMA-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP18]], i32 7 +; NO-FMA-NEXT: [[TMP20:%.*]] = extractelement <8 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP21:%.*]] = insertelement <8 x float> undef, float [[TMP20]], i32 0 +; NO-FMA-NEXT: [[TMP22:%.*]] = extractelement <8 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP23:%.*]] = insertelement <8 x float> [[TMP21]], float [[TMP22]], i32 1 +; NO-FMA-NEXT: [[TMP24:%.*]] = extractelement <8 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP25:%.*]] = insertelement <8 x float> [[TMP23]], float [[TMP24]], i32 2 +; NO-FMA-NEXT: [[TMP26:%.*]] = extractelement <8 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP27:%.*]] = insertelement <8 x float> [[TMP25]], float [[TMP26]], i32 3 +; NO-FMA-NEXT: [[TMP28:%.*]] = extractelement <8 x float> [[TMP3]], i32 4 +; NO-FMA-NEXT: [[TMP29:%.*]] = insertelement <8 x float> [[TMP27]], float [[TMP28]], i32 4 +; NO-FMA-NEXT: [[TMP30:%.*]] = extractelement <8 x float> [[TMP3]], i32 5 +; NO-FMA-NEXT: [[TMP31:%.*]] = insertelement <8 x float> [[TMP29]], float [[TMP30]], i32 5 +; NO-FMA-NEXT: [[TMP32:%.*]] = extractelement <8 x float> [[TMP3]], i32 6 +; NO-FMA-NEXT: [[TMP33:%.*]] = insertelement <8 x float> [[TMP31]], float [[TMP32]], i32 6 +; NO-FMA-NEXT: [[TMP34:%.*]] = extractelement <8 x float> [[TMP3]], i32 7 +; NO-FMA-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP34]], i32 7 +; NO-FMA-NEXT: [[TMP36:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP19]], <8 x float> [[TMP35]]) +; NO-FMA-NEXT: store <8 x float> [[TMP36]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_8f32( @@ -374,86 +377,80 @@ define void @fma_16f32() #0 { ; 
NO-FMA-LABEL: @fma_16f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[A8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[A9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[A10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[A11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[A12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[A13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[A14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[A15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[B9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[B10:%.*]] = load float, float* getelementptr inbounds ([16 x 
float], [16 x float]* @srcB32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[B11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[B12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[B13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[B14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[B15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[C9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[C10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[C11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[C12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[C13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[C14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[C15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]]) -; NO-FMA-NEXT: 
[[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]]) -; NO-FMA-NEXT: [[FMA8:%.*]] = call float @llvm.fma.f32(float [[A8]], float [[B8]], float [[C8]]) -; NO-FMA-NEXT: [[FMA9:%.*]] = call float @llvm.fma.f32(float [[A9]], float [[B9]], float [[C9]]) -; NO-FMA-NEXT: [[FMA10:%.*]] = call float @llvm.fma.f32(float [[A10]], float [[B10]], float [[C10]]) -; NO-FMA-NEXT: [[FMA11:%.*]] = call float @llvm.fma.f32(float [[A11]], float [[B11]], float [[C11]]) -; NO-FMA-NEXT: [[FMA12:%.*]] = call float @llvm.fma.f32(float [[A12]], float [[B12]], float [[C12]]) -; NO-FMA-NEXT: [[FMA13:%.*]] = call float @llvm.fma.f32(float [[A13]], float [[B13]], float [[C13]]) -; NO-FMA-NEXT: [[FMA14:%.*]] = call float @llvm.fma.f32(float [[A14]], float [[B14]], float [[C14]]) -; NO-FMA-NEXT: [[FMA15:%.*]] = call float @llvm.fma.f32(float [[A15]], float [[B15]], float [[C15]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: store float [[FMA8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: store float [[FMA9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: store float [[FMA10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: store float [[FMA11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: store float [[FMA12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: store float [[FMA13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: store float [[FMA14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: store float [[FMA15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; 
NO-FMA-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP8:%.*]] = insertelement <8 x float> undef, float [[TMP7]], i32 0 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP10:%.*]] = insertelement <8 x float> [[TMP8]], float [[TMP9]], i32 1 +; NO-FMA-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP10]], float [[TMP11]], i32 2 +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP13]], i32 3 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <8 x float> [[TMP3]], i32 4 +; NO-FMA-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP15]], i32 4 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP3]], i32 5 +; NO-FMA-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP17]], i32 5 +; NO-FMA-NEXT: [[TMP19:%.*]] = extractelement <8 x float> [[TMP3]], i32 6 +; NO-FMA-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP18]], float [[TMP19]], i32 6 +; NO-FMA-NEXT: [[TMP21:%.*]] = extractelement <8 x float> [[TMP3]], i32 7 +; NO-FMA-NEXT: [[TMP22:%.*]] = insertelement <8 x float> [[TMP20]], float [[TMP21]], i32 7 +; NO-FMA-NEXT: [[TMP23:%.*]] = extractelement <8 x float> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[TMP24:%.*]] = insertelement <8 x float> undef, float [[TMP23]], i32 0 +; NO-FMA-NEXT: [[TMP25:%.*]] = extractelement <8 x float> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP25]], i32 1 +; NO-FMA-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[TMP5]], i32 2 +; NO-FMA-NEXT: [[TMP28:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP27]], i32 2 +; NO-FMA-NEXT: [[TMP29:%.*]] = extractelement <8 x float> [[TMP5]], i32 3 +; NO-FMA-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP28]], float [[TMP29]], i32 3 +; NO-FMA-NEXT: [[TMP31:%.*]] = extractelement <8 x float> [[TMP5]], i32 4 +; NO-FMA-NEXT: [[TMP32:%.*]] = insertelement <8 x float> [[TMP30]], float [[TMP31]], i32 4 +; NO-FMA-NEXT: [[TMP33:%.*]] = extractelement <8 x float> [[TMP5]], i32 5 +; NO-FMA-NEXT: [[TMP34:%.*]] = insertelement <8 x float> [[TMP32]], float [[TMP33]], i32 5 +; NO-FMA-NEXT: [[TMP35:%.*]] = extractelement <8 x float> [[TMP5]], i32 6 +; NO-FMA-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP35]], i32 6 +; NO-FMA-NEXT: [[TMP37:%.*]] = extractelement <8 x float> [[TMP5]], i32 7 +; NO-FMA-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP37]], i32 7 +; NO-FMA-NEXT: [[TMP39:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP22]], <8 x float> [[TMP38]]) +; NO-FMA-NEXT: [[TMP40:%.*]] = extractelement <8 x float> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP41:%.*]] = insertelement <8 x float> undef, float [[TMP40]], i32 0 +; NO-FMA-NEXT: [[TMP42:%.*]] = extractelement <8 x float> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP42]], i32 1 +; NO-FMA-NEXT: [[TMP44:%.*]] = extractelement <8 x float> [[TMP4]], i32 2 +; NO-FMA-NEXT: [[TMP45:%.*]] = insertelement <8 x float> 
[[TMP43]], float [[TMP44]], i32 2 +; NO-FMA-NEXT: [[TMP46:%.*]] = extractelement <8 x float> [[TMP4]], i32 3 +; NO-FMA-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP46]], i32 3 +; NO-FMA-NEXT: [[TMP48:%.*]] = extractelement <8 x float> [[TMP4]], i32 4 +; NO-FMA-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP48]], i32 4 +; NO-FMA-NEXT: [[TMP50:%.*]] = extractelement <8 x float> [[TMP4]], i32 5 +; NO-FMA-NEXT: [[TMP51:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP50]], i32 5 +; NO-FMA-NEXT: [[TMP52:%.*]] = extractelement <8 x float> [[TMP4]], i32 6 +; NO-FMA-NEXT: [[TMP53:%.*]] = insertelement <8 x float> [[TMP51]], float [[TMP52]], i32 6 +; NO-FMA-NEXT: [[TMP54:%.*]] = extractelement <8 x float> [[TMP4]], i32 7 +; NO-FMA-NEXT: [[TMP55:%.*]] = insertelement <8 x float> [[TMP53]], float [[TMP54]], i32 7 +; NO-FMA-NEXT: [[TMP56:%.*]] = extractelement <8 x float> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[TMP57:%.*]] = insertelement <8 x float> undef, float [[TMP56]], i32 0 +; NO-FMA-NEXT: [[TMP58:%.*]] = extractelement <8 x float> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP59:%.*]] = insertelement <8 x float> [[TMP57]], float [[TMP58]], i32 1 +; NO-FMA-NEXT: [[TMP60:%.*]] = extractelement <8 x float> [[TMP6]], i32 2 +; NO-FMA-NEXT: [[TMP61:%.*]] = insertelement <8 x float> [[TMP59]], float [[TMP60]], i32 2 +; NO-FMA-NEXT: [[TMP62:%.*]] = extractelement <8 x float> [[TMP6]], i32 3 +; NO-FMA-NEXT: [[TMP63:%.*]] = insertelement <8 x float> [[TMP61]], float [[TMP62]], i32 3 +; NO-FMA-NEXT: [[TMP64:%.*]] = extractelement <8 x float> [[TMP6]], i32 4 +; NO-FMA-NEXT: [[TMP65:%.*]] = insertelement <8 x float> [[TMP63]], float [[TMP64]], i32 4 +; NO-FMA-NEXT: [[TMP66:%.*]] = extractelement <8 x float> [[TMP6]], i32 5 +; NO-FMA-NEXT: [[TMP67:%.*]] = insertelement <8 x float> [[TMP65]], float [[TMP66]], i32 5 +; NO-FMA-NEXT: [[TMP68:%.*]] = extractelement <8 x float> [[TMP6]], i32 6 +; NO-FMA-NEXT: [[TMP69:%.*]] = insertelement <8 x float> [[TMP67]], float [[TMP68]], i32 6 +; NO-FMA-NEXT: [[TMP70:%.*]] = extractelement <8 x float> [[TMP6]], i32 7 +; NO-FMA-NEXT: [[TMP71:%.*]] = insertelement <8 x float> [[TMP69]], float [[TMP70]], i32 7 +; NO-FMA-NEXT: [[TMP72:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP55]], <8 x float> [[TMP71]]) +; NO-FMA-NEXT: store <8 x float> [[TMP39]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; NO-FMA-NEXT: store <8 x float> [[TMP72]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA256-LABEL: @fma_16f32( Index: test/Transforms/SLPVectorizer/X86/fptosi.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fptosi.ll +++ test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -37,14 +37,18 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 ; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 ; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store 
i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> undef, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptosi_8f64_8i64( @@ -64,14 +68,16 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 ; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 ; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> undef, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: 
[[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptosi_8f64_8i64( @@ -161,30 +167,31 @@ define void @fptosi_8f64_8i16() #0 { ; SSE-LABEL: @fptosi_8f64_8i16( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i16 -; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i16 -; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i16 -; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i16 -; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i16 -; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i16 -; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i16 -; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i16 -; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 
8 +; SSE-NEXT: [[TMP5:%.*]] = fptosi <2 x double> [[TMP1]] to <2 x i16> +; SSE-NEXT: [[TMP6:%.*]] = fptosi <2 x double> [[TMP2]] to <2 x i16> +; SSE-NEXT: [[TMP7:%.*]] = fptosi <2 x double> [[TMP3]] to <2 x i16> +; SSE-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP4]] to <2 x i16> +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <2 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <2 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP24]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @fptosi_8f64_8i16( @@ -297,14 +304,18 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 ; SSE-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 ; SSE-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> 
[[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> undef, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptosi_8f32_8i64( @@ -324,14 +335,16 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 ; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 ; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> undef, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptosi_8f32_8i64( Index: test/Transforms/SLPVectorizer/X86/fptoui.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fptoui.ll +++ test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -37,14 +37,18 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64 ; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64 ; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x 
i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> undef, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f64_8i64( @@ -64,14 +68,16 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64 ; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64 ; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> undef, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x 
i64> [[TMP5]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f64_8i64( @@ -134,41 +140,44 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 ; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 ; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CVT1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CVT2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CVT3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CVT5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CVT6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CVT7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f64_8i32( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* 
getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i32 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i32> +; AVX256NODQ-NEXT: [[TMP6:%.*]] = fptoui <2 x double> [[TMP2]] to <2 x i32> +; AVX256NODQ-NEXT: [[TMP7:%.*]] = fptoui <2 x double> [[TMP3]] to <2 x i32> +; AVX256NODQ-NEXT: [[TMP8:%.*]] = fptoui <2 x double> [[TMP4]] to <2 x i32> +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> undef, i32 [[TMP9]], i32 0 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP11]], i32 1 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[TMP13]], i32 2 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP15]], i32 3 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP17]], i32 4 +; AVX256NODQ-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> 
[[TMP18]], i32 [[TMP19]], i32 5 +; AVX256NODQ-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0 +; AVX256NODQ-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP21]], i32 6 +; AVX256NODQ-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1 +; AVX256NODQ-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP23]], i32 7 +; AVX256NODQ-NEXT: store <8 x i32> [[TMP24]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f64_8i32( @@ -212,57 +221,59 @@ define void @fptoui_8f64_8i16() #0 { ; SSE-LABEL: @fptoui_8f64_8i16( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16 -; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16 -; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16 -; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16 -; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i16 -; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16 -; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16 -; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16 -; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: 
[[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i16> +; SSE-NEXT: [[TMP6:%.*]] = fptoui <2 x double> [[TMP2]] to <2 x i16> +; SSE-NEXT: [[TMP7:%.*]] = fptoui <2 x double> [[TMP3]] to <2 x i16> +; SSE-NEXT: [[TMP8:%.*]] = fptoui <2 x double> [[TMP4]] to <2 x i16> +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <2 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <2 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP24]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f64_8i16( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i16 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16 -; AVX256NODQ-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, 
i64 0), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i16> +; AVX256NODQ-NEXT: [[TMP6:%.*]] = fptoui <2 x double> [[TMP2]] to <2 x i16> +; AVX256NODQ-NEXT: [[TMP7:%.*]] = fptoui <2 x double> [[TMP3]] to <2 x i16> +; AVX256NODQ-NEXT: [[TMP8:%.*]] = fptoui <2 x double> [[TMP4]] to <2 x i16> +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP7]], i32 0 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; AVX256NODQ-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; AVX256NODQ-NEXT: [[TMP21:%.*]] = extractelement <2 x i16> [[TMP8]], i32 0 +; AVX256NODQ-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; AVX256NODQ-NEXT: [[TMP23:%.*]] = extractelement <2 x i16> [[TMP8]], i32 1 +; AVX256NODQ-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; AVX256NODQ-NEXT: store <8 x i16> [[TMP24]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f64_8i16( @@ -381,14 +392,18 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64 ; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64 ; SSE-NEXT: [[CVT7:%.*]] = 
fptoui float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> undef, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f32_8i64( @@ -408,14 +423,16 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64 ; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64 ; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> 
[[TMP1]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> undef, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f32_8i64( @@ -478,14 +495,16 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CVT1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CVT2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CVT3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CVT5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CVT6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CVT7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f32_8i32( @@ -505,14 +524,15 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 ; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 ; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 
2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CVT4]], i32 4 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CVT5]], i32 5 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CVT6]], i32 6 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CVT7]], i32 7 +; AVX256NODQ-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f32_8i32( @@ -572,14 +592,15 @@ ; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i16 ; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i16 ; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i16 -; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> undef, i16 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[CVT1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[CVT2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[CVT3]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[CVT4]], i32 4 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CVT5]], i32 5 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[CVT6]], i32 6 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <8 x i16> [[TMP7]], i16 [[CVT7]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @fptoui_8f32_8i16( Index: test/Transforms/SLPVectorizer/X86/fround.ll 
=================================================================== --- test/Transforms/SLPVectorizer/X86/fround.ll +++ test/Transforms/SLPVectorizer/X86/fround.ll @@ -31,8 +31,9 @@ ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 ; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) ; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_2f64( @@ -66,10 +67,12 @@ ; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) ; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]]) ; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CEIL2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CEIL3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_4f64( @@ -120,14 +123,18 @@ ; SSE2-NEXT: [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[LD5]]) ; SSE2-NEXT: [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[LD6]]) ; SSE2-NEXT: [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[LD7]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[CEIL5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[CEIL6]], 
double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CEIL2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CEIL3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CEIL5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CEIL6]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CEIL7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_8f64( @@ -202,8 +209,9 @@ ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 ; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) ; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_2f64( @@ -237,10 +245,12 @@ ; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) ; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]]) ; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to 
<2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[FLOOR2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[FLOOR3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_4f64( @@ -291,14 +301,18 @@ ; SSE2-NEXT: [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[LD5]]) ; SSE2-NEXT: [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[LD6]]) ; SSE2-NEXT: [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[LD7]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[FLOOR2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[FLOOR3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FLOOR5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[FLOOR6]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[FLOOR7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_8f64( @@ -373,8 +387,9 @@ ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 ; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, 
i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[NEARBYINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_2f64( @@ -408,10 +423,12 @@ ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) ; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]]) ; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[NEARBYINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[NEARBYINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_4f64( @@ -462,14 +479,18 @@ ; SSE2-NEXT: [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[LD5]]) ; SSE2-NEXT: [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[LD6]]) ; SSE2-NEXT: [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[LD7]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[NEARBYINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[NEARBYINT1]], i32 
1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[NEARBYINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[NEARBYINT5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[NEARBYINT6]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[NEARBYINT7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_8f64( @@ -544,8 +565,9 @@ ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 ; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) ; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_2f64( @@ -579,10 +601,12 @@ ; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) ; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]]) ; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[RINT2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[RINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; 
SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_4f64( @@ -633,14 +657,18 @@ ; SSE2-NEXT: [[RINT5:%.*]] = call double @llvm.rint.f64(double [[LD5]]) ; SSE2-NEXT: [[RINT6:%.*]] = call double @llvm.rint.f64(double [[LD6]]) ; SSE2-NEXT: [[RINT7:%.*]] = call double @llvm.rint.f64(double [[LD7]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[RINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[RINT2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[RINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[RINT5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[RINT6]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[RINT7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_8f64( @@ -715,8 +743,9 @@ ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 ; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) ; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x 
double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_2f64( @@ -750,10 +779,12 @@ ; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) ; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) ; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TRUNC2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TRUNC3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_4f64( @@ -804,14 +835,18 @@ ; SSE2-NEXT: [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[LD5]]) ; SSE2-NEXT: [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[LD6]]) ; SSE2-NEXT: [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[LD7]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TRUNC2]], i32 0 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TRUNC3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[TRUNC4]], i32 0 
+; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[TRUNC5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TRUNC6]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TRUNC7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_8f64( @@ -890,10 +925,11 @@ ; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) ; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) ; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_4f32( @@ -941,14 +977,16 @@ ; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]]) ; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]]) ; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> 
[[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[CEIL5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CEIL6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CEIL7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_8f32( @@ -1027,22 +1065,26 @@ ; SSE2-NEXT: [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[LD13]]) ; SSE2-NEXT: [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[LD14]]) ; SSE2-NEXT: [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[LD15]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[CEIL8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[CEIL9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[CEIL10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[CEIL11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float 
[[CEIL5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CEIL6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CEIL7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[CEIL8]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[CEIL9]], i32 1 +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[CEIL10]], i32 2 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CEIL11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[CEIL12]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CEIL13]], i32 1 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[CEIL14]], i32 2 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CEIL15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_16f32( @@ -1145,10 +1187,11 @@ ; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) ; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) ; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_4f32( @@ -1196,14 +1239,16 @@ ; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]]) ; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]]) ; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[FLOOR4]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[FLOOR5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[FLOOR6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[FLOOR7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_8f32( @@ -1282,22 +1327,26 @@ ; SSE2-NEXT: [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[LD13]]) ; SSE2-NEXT: [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[LD14]]) ; SSE2-NEXT: [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[LD15]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[FLOOR8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[FLOOR9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[FLOOR10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 
-; SSE2-NEXT: store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[FLOOR5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[FLOOR6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[FLOOR7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[FLOOR8]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[FLOOR9]], i32 1 +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[FLOOR10]], i32 2 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[FLOOR11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[FLOOR12]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[FLOOR13]], i32 1 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[FLOOR14]], i32 2 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[FLOOR15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_16f32( @@ -1400,10 +1449,11 @@ ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) ; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) ; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 
x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_4f32( @@ -1451,14 +1501,16 @@ ; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) ; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) ; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[NEARBYINT5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[NEARBYINT6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[NEARBYINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_8f32( @@ -1537,22 +1589,26 @@ ; SSE2-NEXT: [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[LD13]]) ; SSE2-NEXT: [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[LD14]]) ; SSE2-NEXT: [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[LD15]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x 
float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[NEARBYINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[NEARBYINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[NEARBYINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[NEARBYINT5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[NEARBYINT6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[NEARBYINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT8]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[NEARBYINT9]], i32 1 +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[NEARBYINT10]], i32 2 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[NEARBYINT11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[NEARBYINT12]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[NEARBYINT13]], i32 1 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[NEARBYINT14]], i32 2 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[NEARBYINT15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: 
@nearbyint_16f32( @@ -1655,10 +1711,11 @@ ; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) ; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) ; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_4f32( @@ -1706,14 +1763,16 @@ ; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) ; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) ; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[RINT5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[RINT6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[RINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_8f32( @@ 
-1792,22 +1851,26 @@ ; SSE2-NEXT: [[RINT13:%.*]] = call float @llvm.rint.f32(float [[LD13]]) ; SSE2-NEXT: [[RINT14:%.*]] = call float @llvm.rint.f32(float [[LD14]]) ; SSE2-NEXT: [[RINT15:%.*]] = call float @llvm.rint.f32(float [[LD15]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[RINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[RINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[RINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[RINT5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[RINT6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[RINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[RINT8]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[RINT9]], i32 1 +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[RINT10]], i32 2 +; SSE2-NEXT: [[TMP12:%.*]] 
= insertelement <4 x float> [[TMP11]], float [[RINT11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[RINT12]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[RINT13]], i32 1 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[RINT14]], i32 2 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[RINT15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_16f32( @@ -1910,10 +1973,11 @@ ; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) ; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) ; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_4f32( @@ -1961,14 +2025,16 @@ ; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) ; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) ; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] 
= insertelement <4 x float> [[TMP2]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[TRUNC4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TRUNC5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TRUNC6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TRUNC7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_8f32( @@ -2047,22 +2113,26 @@ ; SSE2-NEXT: [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[LD13]]) ; SSE2-NEXT: [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[LD14]]) ; SSE2-NEXT: [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[LD15]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[TRUNC8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[TRUNC9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[TRUNC10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP4]], <4 x float>* 
bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[TRUNC4]], i32 0 +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TRUNC5]], i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TRUNC6]], i32 2 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TRUNC7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TRUNC8]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TRUNC9]], i32 1 +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TRUNC10]], i32 2 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TRUNC11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[TRUNC12]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TRUNC13]], i32 1 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TRUNC14]], i32 2 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TRUNC15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_16f32( Index: test/Transforms/SLPVectorizer/X86/powof2div.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/powof2div.ll +++ test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -60,35 +60,39 @@ define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ ; AVX1-LABEL: @powof2div_nonuniform( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4 -; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[ADD]], 2 -; AVX1-NEXT: store i32 [[DIV]], i32* [[A:%.*]], align 4 -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 -; AVX1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1 -; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 -; AVX1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] -; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4 -; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; AVX1-NEXT: store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 ; AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; AVX1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 ; AVX1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; AVX1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4 -; AVX1-NEXT: [[ADD10:%.*]] = add nsw 
i32 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8 ; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; AVX1-NEXT: store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4 ; AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; AVX1-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; AVX1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; AVX1-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4 -; AVX1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]] -; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16 +; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> undef, i32 [[TMP12]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP16]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP18]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP19]] +; AVX1-NEXT: [[TMP21:%.*]] = sdiv <4 x i32> [[TMP20]], ; AVX1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; AVX1-NEXT: store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4 +; AVX1-NEXT: [[TMP22:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; AVX1-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP22]], align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @powof2div_nonuniform( Index: test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -15,32 +15,59 @@ ; Vector cost is 6, Scalar cost is 7 ; SSE: Adding cost -1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction) define i32 @test_add(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_add( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; 
CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = add i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_add( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = add i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_add( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: 
[[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = add i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -136,32 +163,59 @@ ; } define i32 @test_and(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_and( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = and i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_and( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* 
[[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = and i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_and( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = and i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -197,32 +251,59 @@ ; } define i32 @test_or(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_or( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; 
CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = or i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_or( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = or i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_or( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = 
getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = or i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -258,32 +339,59 @@ ; } define i32 @test_xor(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_xor( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_xor( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: 
[[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_xor( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -312,25 +420,45 @@ } define i32 @PR37731(<4 x i32>* noalias nocapture dereferenceable(16) %self) unnamed_addr #0 { -; CHECK-LABEL: @PR37731( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 
16 -; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = xor i32 undef, undef -; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef -; CHECK-NEXT: ret i32 [[TMP9]] +; AVX-LABEL: @PR37731( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16 +; AVX-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; AVX-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] +; AVX-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], +; AVX-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], +; AVX-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], +; AVX-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] +; AVX-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 +; AVX-NEXT: [[TMP7:%.*]] = xor i32 undef, undef +; AVX-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef +; AVX-NEXT: ret i32 [[TMP9]] +; +; SSE-LABEL: @PR37731( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16 +; SSE-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; SSE-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] +; SSE-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], +; SSE-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 +; SSE-NEXT: [[TMP7:%.*]] = xor i32 undef, undef +; SSE-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef +; SSE-NEXT: ret i32 [[TMP9]] ; entry: %0 = load <4 x i32>, <4 x i32>* %self, align 16 Index: test/Transforms/SLPVectorizer/X86/shift-ashr.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -22,73 +22,97 @@ 
define void @ashr_v8i64() { ; SSE-LABEL: @ashr_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]] -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds 
([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = ashr <2 x i64> [[TMP12]], [[TMP16]] +; SSE-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> undef, i64 [[TMP18]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP20]], i32 1 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> undef, i64 [[TMP22]], i32 0 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; SSE-NEXT: [[TMP26:%.*]] = ashr <2 x i64> [[TMP21]], [[TMP25]] +; SSE-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x i64> undef, i64 [[TMP27]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP28]], i64 [[TMP29]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> undef, i64 [[TMP31]], i32 0 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP33]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = ashr <2 x i64> [[TMP30]], [[TMP34]] +; SSE-NEXT: [[TMP36:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <2 x i64> undef, i64 [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> [[TMP37]], i64 [[TMP38]], i32 1 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <2 x i64> undef, i64 [[TMP40]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <2 x i64> [[TMP41]], i64 [[TMP42]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = ashr <2 x i64> [[TMP39]], [[TMP43]] +; SSE-NEXT: store <2 x i64> [[TMP17]], 
<2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ashr_v8i64( -; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]] -; AVX1-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]] -; AVX1-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]] -; AVX1-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]] -; AVX1-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]] -; AVX1-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]] -; AVX1-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]] -; AVX1-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]] -; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; AVX1-NEXT: store i64 
[[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = ashr <4 x i64> [[TMP12]], [[TMP20]] +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = ashr <4 x i64> [[TMP29]], [[TMP37]] +; AVX1-NEXT: store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x 
i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ashr_v8i64( @@ -157,89 +181,161 @@ define void @ashr_v16i32() { ; SSE-LABEL: @ashr_v16i32( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; SSE-NEXT: [[B12:%.*]] = load i32, i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = ashr i32 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i32 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = ashr i32 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = ashr i32 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i32 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i32 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i32 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = ashr i32 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = ashr i32 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = ashr i32 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = ashr i32 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = ashr i32 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = ashr i32 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = ashr i32 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = ashr i32 [[A15]], [[B15]] -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x 
i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> undef, i32 [[TMP17]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP19]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP21]], i32 2 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP23]], i32 3 +; SSE-NEXT: [[TMP25:%.*]] = ashr <4 x i32> [[TMP16]], [[TMP24]] +; SSE-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> undef, i32 [[TMP26]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP28]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP30]], i32 2 +; SSE-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP32]], i32 3 +; SSE-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP34]], i32 0 +; SSE-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP36]], i32 1 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP38]], i32 2 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP40]], i32 3 +; SSE-NEXT: [[TMP42:%.*]] = ashr <4 x i32> [[TMP33]], [[TMP41]] +; SSE-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> undef, i32 [[TMP43]], i32 0 +; SSE-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = 
insertelement <4 x i32> [[TMP44]], i32 [[TMP45]], i32 1 +; SSE-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP47]], i32 2 +; SSE-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP48]], i32 [[TMP49]], i32 3 +; SSE-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> undef, i32 [[TMP51]], i32 0 +; SSE-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP54:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[TMP53]], i32 1 +; SSE-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 +; SSE-NEXT: [[TMP56:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP55]], i32 2 +; SSE-NEXT: [[TMP57:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 +; SSE-NEXT: [[TMP58:%.*]] = insertelement <4 x i32> [[TMP56]], i32 [[TMP57]], i32 3 +; SSE-NEXT: [[TMP59:%.*]] = ashr <4 x i32> [[TMP50]], [[TMP58]] +; SSE-NEXT: [[TMP60:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> undef, i32 [[TMP60]], i32 0 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP63:%.*]] = insertelement <4 x i32> [[TMP61]], i32 [[TMP62]], i32 1 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP65:%.*]] = insertelement <4 x i32> [[TMP63]], i32 [[TMP64]], i32 2 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> [[TMP65]], i32 [[TMP66]], i32 3 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> undef, i32 [[TMP68]], i32 0 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP70]], i32 1 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; SSE-NEXT: [[TMP73:%.*]] = insertelement <4 x i32> [[TMP71]], i32 [[TMP72]], i32 2 +; SSE-NEXT: [[TMP74:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; SSE-NEXT: [[TMP75:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP74]], i32 3 +; SSE-NEXT: [[TMP76:%.*]] = ashr <4 x i32> [[TMP67]], [[TMP75]] +; SSE-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP42]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP59]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP76]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ashr_v16i32( -; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), 
align 4 -; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[TMP13]], i32 4 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP15]], i32 5 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP17]], i32 6 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP19]], i32 7 +; AVX1-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> undef, i32 [[TMP21]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP23]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = 
extractelement <8 x i32> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP25]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP27]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; AVX1-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP29]], i32 4 +; AVX1-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; AVX1-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP31]], i32 5 +; AVX1-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; AVX1-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 6 +; AVX1-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; AVX1-NEXT: [[TMP36:%.*]] = insertelement <8 x i32> [[TMP34]], i32 [[TMP35]], i32 7 +; AVX1-NEXT: [[TMP37:%.*]] = ashr <8 x i32> [[TMP20]], [[TMP36]] +; AVX1-NEXT: [[TMP38:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP39:%.*]] = insertelement <8 x i32> undef, i32 [[TMP38]], i32 0 +; AVX1-NEXT: [[TMP40:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP41:%.*]] = insertelement <8 x i32> [[TMP39]], i32 [[TMP40]], i32 1 +; AVX1-NEXT: [[TMP42:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP43:%.*]] = insertelement <8 x i32> [[TMP41]], i32 [[TMP42]], i32 2 +; AVX1-NEXT: [[TMP44:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP45:%.*]] = insertelement <8 x i32> [[TMP43]], i32 [[TMP44]], i32 3 +; AVX1-NEXT: [[TMP46:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX1-NEXT: [[TMP47:%.*]] = insertelement <8 x i32> [[TMP45]], i32 [[TMP46]], i32 4 +; AVX1-NEXT: [[TMP48:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX1-NEXT: [[TMP49:%.*]] = insertelement <8 x i32> [[TMP47]], i32 [[TMP48]], i32 5 +; AVX1-NEXT: [[TMP50:%.*]] = extractelement <8 x i32> [[TMP2]], i32 6 +; AVX1-NEXT: [[TMP51:%.*]] = insertelement <8 x i32> [[TMP49]], i32 [[TMP50]], i32 6 +; AVX1-NEXT: [[TMP52:%.*]] = extractelement <8 x i32> [[TMP2]], i32 7 +; AVX1-NEXT: [[TMP53:%.*]] = insertelement <8 x i32> [[TMP51]], i32 [[TMP52]], i32 7 +; AVX1-NEXT: [[TMP54:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP55:%.*]] = insertelement <8 x i32> undef, i32 [[TMP54]], i32 0 +; AVX1-NEXT: [[TMP56:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP57:%.*]] = insertelement <8 x i32> [[TMP55]], i32 [[TMP56]], i32 1 +; AVX1-NEXT: [[TMP58:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP59:%.*]] = insertelement <8 x i32> [[TMP57]], i32 [[TMP58]], i32 2 +; AVX1-NEXT: [[TMP60:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP61:%.*]] = insertelement <8 x i32> [[TMP59]], i32 [[TMP60]], i32 3 +; AVX1-NEXT: [[TMP62:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4 +; AVX1-NEXT: [[TMP63:%.*]] = insertelement <8 x i32> [[TMP61]], i32 [[TMP62]], i32 4 +; AVX1-NEXT: [[TMP64:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5 +; AVX1-NEXT: [[TMP65:%.*]] = insertelement <8 x i32> [[TMP63]], i32 [[TMP64]], i32 5 +; AVX1-NEXT: [[TMP66:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6 +; AVX1-NEXT: [[TMP67:%.*]] = insertelement <8 x i32> [[TMP65]], i32 [[TMP66]], i32 6 +; AVX1-NEXT: [[TMP68:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7 +; AVX1-NEXT: [[TMP69:%.*]] = insertelement <8 x i32> [[TMP67]], i32 [[TMP68]], i32 7 +; AVX1-NEXT: [[TMP70:%.*]] = ashr <8 x i32> [[TMP53]], [[TMP69]] +; 
AVX1-NEXT: store <8 x i32> [[TMP37]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX1-NEXT: store <8 x i32> [[TMP70]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ashr_v16i32( @@ -340,134 +436,150 @@ define void @ashr_v32i16() { ; SSE-LABEL: @ashr_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds 
([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, 
i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = ashr i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = ashr i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = ashr i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = ashr i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = ashr i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = ashr i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = ashr i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = ashr i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = ashr i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = ashr i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = ashr i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = ashr i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = ashr i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = ashr i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = ashr i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = ashr i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = ashr i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = ashr i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = ashr i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = ashr i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = ashr i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = ashr i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = ashr i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = ashr i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = ashr i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = ashr i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = ashr i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds 
([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <8 x i16> undef, i16 [[TMP25]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <8 x i16> [[TMP26]], i16 [[TMP27]], i32 1 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <8 x i16> [[TMP28]], i16 [[TMP29]], i32 2 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <8 x i16> [[TMP30]], i16 [[TMP31]], i32 3 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <8 x i16> [[TMP32]], i16 [[TMP33]], i32 4 +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <8 x i16> [[TMP34]], i16 [[TMP35]], i32 5 +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = insertelement <8 x i16> [[TMP36]], i16 [[TMP37]], i32 6 +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = insertelement <8 x i16> [[TMP38]], i16 [[TMP39]], i32 7 +; SSE-NEXT: [[TMP41:%.*]] = ashr <8 x i16> [[TMP24]], [[TMP40]] +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <8 x i16> undef, i16 [[TMP42]], i32 0 +; SSE-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <8 x i16> [[TMP43]], i16 [[TMP44]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <8 x i16> [[TMP45]], i16 [[TMP46]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <8 x i16> [[TMP47]], i16 [[TMP48]], i32 3 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; 
SSE-NEXT: [[TMP51:%.*]] = insertelement <8 x i16> [[TMP49]], i16 [[TMP50]], i32 4 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP53:%.*]] = insertelement <8 x i16> [[TMP51]], i16 [[TMP52]], i32 5 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP55:%.*]] = insertelement <8 x i16> [[TMP53]], i16 [[TMP54]], i32 6 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP57:%.*]] = insertelement <8 x i16> [[TMP55]], i16 [[TMP56]], i32 7 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP59:%.*]] = insertelement <8 x i16> undef, i16 [[TMP58]], i32 0 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP61:%.*]] = insertelement <8 x i16> [[TMP59]], i16 [[TMP60]], i32 1 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[TMP63:%.*]] = insertelement <8 x i16> [[TMP61]], i16 [[TMP62]], i32 2 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[TMP65:%.*]] = insertelement <8 x i16> [[TMP63]], i16 [[TMP64]], i32 3 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[TMP67:%.*]] = insertelement <8 x i16> [[TMP65]], i16 [[TMP66]], i32 4 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[TMP69:%.*]] = insertelement <8 x i16> [[TMP67]], i16 [[TMP68]], i32 5 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[TMP71:%.*]] = insertelement <8 x i16> [[TMP69]], i16 [[TMP70]], i32 6 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> [[TMP71]], i16 [[TMP72]], i32 7 +; SSE-NEXT: [[TMP74:%.*]] = ashr <8 x i16> [[TMP57]], [[TMP73]] +; SSE-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> undef, i16 [[TMP75]], i32 0 +; SSE-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[TMP77]], i32 1 +; SSE-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[TMP79]], i32 2 +; SSE-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP80]], i16 [[TMP81]], i32 3 +; SSE-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[TMP83]], i32 4 +; SSE-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[TMP85]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[TMP87]], i32 6 +; SSE-NEXT: [[TMP89:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP90:%.*]] = insertelement <8 x i16> [[TMP88]], i16 [[TMP89]], i32 7 +; SSE-NEXT: [[TMP91:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> undef, i16 [[TMP91]], i32 0 +; SSE-NEXT: [[TMP93:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[TMP93]], i32 1 +; SSE-NEXT: [[TMP95:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[TMP95]], i32 2 +; 
SSE-NEXT: [[TMP97:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP96]], i16 [[TMP97]], i32 3 +; SSE-NEXT: [[TMP99:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[TMP99]], i32 4 +; SSE-NEXT: [[TMP101:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[TMP101]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[TMP103]], i32 6 +; SSE-NEXT: [[TMP105:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[TMP106:%.*]] = insertelement <8 x i16> [[TMP104]], i16 [[TMP105]], i32 7 +; SSE-NEXT: [[TMP107:%.*]] = ashr <8 x i16> [[TMP90]], [[TMP106]] +; SSE-NEXT: [[TMP108:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP109:%.*]] = insertelement <8 x i16> undef, i16 [[TMP108]], i32 0 +; SSE-NEXT: [[TMP110:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP111:%.*]] = insertelement <8 x i16> [[TMP109]], i16 [[TMP110]], i32 1 +; SSE-NEXT: [[TMP112:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP113:%.*]] = insertelement <8 x i16> [[TMP111]], i16 [[TMP112]], i32 2 +; SSE-NEXT: [[TMP114:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP115:%.*]] = insertelement <8 x i16> [[TMP113]], i16 [[TMP114]], i32 3 +; SSE-NEXT: [[TMP116:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP117:%.*]] = insertelement <8 x i16> [[TMP115]], i16 [[TMP116]], i32 4 +; SSE-NEXT: [[TMP118:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP119:%.*]] = insertelement <8 x i16> [[TMP117]], i16 [[TMP118]], i32 5 +; SSE-NEXT: [[TMP120:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP121:%.*]] = insertelement <8 x i16> [[TMP119]], i16 [[TMP120]], i32 6 +; SSE-NEXT: [[TMP122:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP123:%.*]] = insertelement <8 x i16> [[TMP121]], i16 [[TMP122]], i32 7 +; SSE-NEXT: [[TMP124:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP125:%.*]] = insertelement <8 x i16> undef, i16 [[TMP124]], i32 0 +; SSE-NEXT: [[TMP126:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP127:%.*]] = insertelement <8 x i16> [[TMP125]], i16 [[TMP126]], i32 1 +; SSE-NEXT: [[TMP128:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[TMP129:%.*]] = insertelement <8 x i16> [[TMP127]], i16 [[TMP128]], i32 2 +; SSE-NEXT: [[TMP130:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[TMP131:%.*]] = insertelement <8 x i16> [[TMP129]], i16 [[TMP130]], i32 3 +; SSE-NEXT: [[TMP132:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[TMP133:%.*]] = insertelement <8 x i16> [[TMP131]], i16 [[TMP132]], i32 4 +; SSE-NEXT: [[TMP134:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[TMP135:%.*]] = insertelement <8 x i16> [[TMP133]], i16 [[TMP134]], i32 5 +; SSE-NEXT: [[TMP136:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[TMP137:%.*]] = insertelement <8 x i16> [[TMP135]], i16 [[TMP136]], i32 6 +; SSE-NEXT: [[TMP138:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[TMP139:%.*]] = insertelement <8 x i16> [[TMP137]], i16 [[TMP138]], i32 7 +; SSE-NEXT: [[TMP140:%.*]] = ashr <8 x i16> [[TMP123]], [[TMP139]] +; SSE-NEXT: store <8 x i16> [[TMP41]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 
+; SSE-NEXT: store <8 x i16> [[TMP74]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP107]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP140]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @ashr_v32i16( Index: test/Transforms/SLPVectorizer/X86/shift-lshr.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-lshr.ll +++ test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -41,22 +41,46 @@ ; SSE-NEXT: ret void ; ; AVX1-LABEL: @lshr_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> 
[[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = lshr <4 x i64> [[TMP12]], [[TMP20]] +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = lshr <4 x i64> [[TMP29]], [[TMP37]] +; AVX1-NEXT: store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @lshr_v8i64( @@ -125,70 +149,86 @@ define void @lshr_v16i32() { ; SSE-LABEL: @lshr_v16i32( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = lshr i32 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = lshr i32 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = lshr i32 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = lshr i32 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = lshr i32 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = lshr i32 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = lshr i32 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = lshr i32 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = lshr i32 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = lshr i32 [[A9]], [[B9]] -; 
SSE-NEXT: [[R10:%.*]] = lshr i32 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = lshr i32 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = lshr i32 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = lshr i32 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = lshr i32 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = lshr i32 [[A15]], [[B15]] -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE-NEXT: 
[[TMP12:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> undef, i32 [[TMP17]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP19]], i32 1 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP21]], i32 2 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP23]], i32 3 +; SSE-NEXT: [[TMP25:%.*]] = lshr <4 x i32> [[TMP16]], [[TMP24]] +; SSE-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> undef, i32 [[TMP26]], i32 0 +; SSE-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP28]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP30]], i32 2 +; SSE-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP32]], i32 3 +; SSE-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP34]], i32 0 +; SSE-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP36]], i32 1 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 +; SSE-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP38]], i32 2 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP40]], i32 3 +; SSE-NEXT: [[TMP42:%.*]] = lshr <4 x i32> [[TMP33]], [[TMP41]] +; SSE-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> undef, i32 [[TMP43]], i32 0 +; SSE-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP45]], i32 1 +; SSE-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP47]], i32 2 +; SSE-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP48]], i32 [[TMP49]], i32 3 +; SSE-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> undef, i32 [[TMP51]], i32 0 +; SSE-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP54:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[TMP53]], i32 1 +; SSE-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 +; SSE-NEXT: [[TMP56:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP55]], i32 2 +; SSE-NEXT: [[TMP57:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 +; SSE-NEXT: [[TMP58:%.*]] = insertelement <4 x 
i32> [[TMP56]], i32 [[TMP57]], i32 3 +; SSE-NEXT: [[TMP59:%.*]] = lshr <4 x i32> [[TMP50]], [[TMP58]] +; SSE-NEXT: [[TMP60:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> undef, i32 [[TMP60]], i32 0 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP63:%.*]] = insertelement <4 x i32> [[TMP61]], i32 [[TMP62]], i32 1 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP65:%.*]] = insertelement <4 x i32> [[TMP63]], i32 [[TMP64]], i32 2 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> [[TMP65]], i32 [[TMP66]], i32 3 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> undef, i32 [[TMP68]], i32 0 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP70]], i32 1 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; SSE-NEXT: [[TMP73:%.*]] = insertelement <4 x i32> [[TMP71]], i32 [[TMP72]], i32 2 +; SSE-NEXT: [[TMP74:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; SSE-NEXT: [[TMP75:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP74]], i32 3 +; SSE-NEXT: [[TMP76:%.*]] = lshr <4 x i32> [[TMP67]], [[TMP75]] +; SSE-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP42]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP59]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP76]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v16i32( @@ -289,134 +329,150 @@ define void @lshr_v32i16() { ; SSE-LABEL: @lshr_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load 
i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; 
SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = lshr i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = lshr i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = lshr i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = lshr i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = lshr i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = lshr i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = lshr i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = lshr i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = lshr i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = lshr i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = lshr i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = lshr i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = lshr i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = lshr i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = lshr i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = lshr i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = lshr i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = lshr i16 [[A17]], 
[[B17]] -; SSE-NEXT: [[R18:%.*]] = lshr i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = lshr i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = lshr i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = lshr i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = lshr i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = lshr i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = lshr i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = lshr i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = lshr i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = lshr i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = lshr i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = lshr i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = lshr i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = lshr i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 
x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <8 x i16> undef, i16 [[TMP25]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <8 x i16> [[TMP26]], 
i16 [[TMP27]], i32 1 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <8 x i16> [[TMP28]], i16 [[TMP29]], i32 2 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <8 x i16> [[TMP30]], i16 [[TMP31]], i32 3 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <8 x i16> [[TMP32]], i16 [[TMP33]], i32 4 +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <8 x i16> [[TMP34]], i16 [[TMP35]], i32 5 +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = insertelement <8 x i16> [[TMP36]], i16 [[TMP37]], i32 6 +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = insertelement <8 x i16> [[TMP38]], i16 [[TMP39]], i32 7 +; SSE-NEXT: [[TMP41:%.*]] = lshr <8 x i16> [[TMP24]], [[TMP40]] +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <8 x i16> undef, i16 [[TMP42]], i32 0 +; SSE-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <8 x i16> [[TMP43]], i16 [[TMP44]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <8 x i16> [[TMP45]], i16 [[TMP46]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <8 x i16> [[TMP47]], i16 [[TMP48]], i32 3 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <8 x i16> [[TMP49]], i16 [[TMP50]], i32 4 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP53:%.*]] = insertelement <8 x i16> [[TMP51]], i16 [[TMP52]], i32 5 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP55:%.*]] = insertelement <8 x i16> [[TMP53]], i16 [[TMP54]], i32 6 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP57:%.*]] = insertelement <8 x i16> [[TMP55]], i16 [[TMP56]], i32 7 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP59:%.*]] = insertelement <8 x i16> undef, i16 [[TMP58]], i32 0 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP61:%.*]] = insertelement <8 x i16> [[TMP59]], i16 [[TMP60]], i32 1 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[TMP63:%.*]] = insertelement <8 x i16> [[TMP61]], i16 [[TMP62]], i32 2 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[TMP65:%.*]] = insertelement <8 x i16> [[TMP63]], i16 [[TMP64]], i32 3 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[TMP67:%.*]] = insertelement <8 x i16> [[TMP65]], i16 [[TMP66]], i32 4 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[TMP69:%.*]] = insertelement <8 x i16> [[TMP67]], i16 [[TMP68]], i32 5 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[TMP71:%.*]] = insertelement <8 x i16> [[TMP69]], i16 [[TMP70]], i32 6 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> [[TMP71]], i16 [[TMP72]], i32 7 +; SSE-NEXT: [[TMP74:%.*]] = lshr <8 x i16> [[TMP57]], [[TMP73]] 
+; SSE-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> undef, i16 [[TMP75]], i32 0 +; SSE-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[TMP77]], i32 1 +; SSE-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[TMP79]], i32 2 +; SSE-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP80]], i16 [[TMP81]], i32 3 +; SSE-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[TMP83]], i32 4 +; SSE-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[TMP85]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[TMP87]], i32 6 +; SSE-NEXT: [[TMP89:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP90:%.*]] = insertelement <8 x i16> [[TMP88]], i16 [[TMP89]], i32 7 +; SSE-NEXT: [[TMP91:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> undef, i16 [[TMP91]], i32 0 +; SSE-NEXT: [[TMP93:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[TMP93]], i32 1 +; SSE-NEXT: [[TMP95:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[TMP95]], i32 2 +; SSE-NEXT: [[TMP97:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP96]], i16 [[TMP97]], i32 3 +; SSE-NEXT: [[TMP99:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[TMP99]], i32 4 +; SSE-NEXT: [[TMP101:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[TMP101]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[TMP103]], i32 6 +; SSE-NEXT: [[TMP105:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[TMP106:%.*]] = insertelement <8 x i16> [[TMP104]], i16 [[TMP105]], i32 7 +; SSE-NEXT: [[TMP107:%.*]] = lshr <8 x i16> [[TMP90]], [[TMP106]] +; SSE-NEXT: [[TMP108:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP109:%.*]] = insertelement <8 x i16> undef, i16 [[TMP108]], i32 0 +; SSE-NEXT: [[TMP110:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP111:%.*]] = insertelement <8 x i16> [[TMP109]], i16 [[TMP110]], i32 1 +; SSE-NEXT: [[TMP112:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP113:%.*]] = insertelement <8 x i16> [[TMP111]], i16 [[TMP112]], i32 2 +; SSE-NEXT: [[TMP114:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP115:%.*]] = insertelement <8 x i16> [[TMP113]], i16 [[TMP114]], i32 3 +; SSE-NEXT: [[TMP116:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP117:%.*]] = insertelement <8 x i16> [[TMP115]], i16 [[TMP116]], i32 4 +; SSE-NEXT: [[TMP118:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP119:%.*]] = insertelement <8 x i16> [[TMP117]], i16 [[TMP118]], i32 5 +; SSE-NEXT: [[TMP120:%.*]] = extractelement <8 x 
i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP121:%.*]] = insertelement <8 x i16> [[TMP119]], i16 [[TMP120]], i32 6 +; SSE-NEXT: [[TMP122:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP123:%.*]] = insertelement <8 x i16> [[TMP121]], i16 [[TMP122]], i32 7 +; SSE-NEXT: [[TMP124:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP125:%.*]] = insertelement <8 x i16> undef, i16 [[TMP124]], i32 0 +; SSE-NEXT: [[TMP126:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP127:%.*]] = insertelement <8 x i16> [[TMP125]], i16 [[TMP126]], i32 1 +; SSE-NEXT: [[TMP128:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[TMP129:%.*]] = insertelement <8 x i16> [[TMP127]], i16 [[TMP128]], i32 2 +; SSE-NEXT: [[TMP130:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[TMP131:%.*]] = insertelement <8 x i16> [[TMP129]], i16 [[TMP130]], i32 3 +; SSE-NEXT: [[TMP132:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[TMP133:%.*]] = insertelement <8 x i16> [[TMP131]], i16 [[TMP132]], i32 4 +; SSE-NEXT: [[TMP134:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[TMP135:%.*]] = insertelement <8 x i16> [[TMP133]], i16 [[TMP134]], i32 5 +; SSE-NEXT: [[TMP136:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[TMP137:%.*]] = insertelement <8 x i16> [[TMP135]], i16 [[TMP136]], i32 6 +; SSE-NEXT: [[TMP138:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[TMP139:%.*]] = insertelement <8 x i16> [[TMP137]], i16 [[TMP138]], i32 7 +; SSE-NEXT: [[TMP140:%.*]] = lshr <8 x i16> [[TMP123]], [[TMP139]] +; SSE-NEXT: store <8 x i16> [[TMP41]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP74]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP107]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP140]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v32i16( Index: test/Transforms/SLPVectorizer/X86/shift-shl.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-shl.ll +++ test/Transforms/SLPVectorizer/X86/shift-shl.ll @@ -41,22 +41,46 @@ ; SSE-NEXT: ret void ; ; AVX1-LABEL: @shl_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: 
[[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> undef, i64 [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP7]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP13]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP15]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP17]], i32 2 +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i32 3 +; AVX1-NEXT: [[TMP21:%.*]] = shl <4 x i64> [[TMP12]], [[TMP20]] +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> undef, i64 [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP23]], i64 [[TMP24]], i32 1 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP25]], i64 [[TMP26]], i32 2 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP27]], i64 [[TMP28]], i32 3 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> undef, 
i64 [[TMP30]], i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP32]], i32 1 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP34]], i32 2 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP36]], i32 3 +; AVX1-NEXT: [[TMP38:%.*]] = shl <4 x i64> [[TMP29]], [[TMP37]] +; AVX1-NEXT: store <4 x i64> [[TMP21]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX1-NEXT: store <4 x i64> [[TMP38]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @shl_v8i64( @@ -241,134 +265,150 @@ define void @shl_v32i16() { ; SSE-LABEL: @shl_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), 
align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x 
i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = shl i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = shl i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = shl i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = shl i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = shl i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = shl i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = shl i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = shl i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = shl i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = shl i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = shl i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = shl i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = shl i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = shl i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = shl i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = shl i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = shl i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = shl i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = shl i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = shl i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = shl i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = shl i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = shl i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = shl i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = shl i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = shl i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = shl i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = shl i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = shl i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = shl i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = shl i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = shl i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), 
align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* 
getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> undef, i16 [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP11]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP13]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP15]], i32 3 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP17]], i32 4 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP19]], i32 5 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP21]], i32 6 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP23]], i32 7 +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <8 x i16> undef, i16 [[TMP25]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <8 x i16> [[TMP26]], i16 [[TMP27]], i32 1 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <8 x i16> [[TMP28]], i16 [[TMP29]], i32 2 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <8 x i16> [[TMP30]], i16 [[TMP31]], i32 3 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <8 x i16> [[TMP32]], i16 [[TMP33]], i32 4 +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <8 x i16> [[TMP34]], i16 [[TMP35]], i32 5 +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = insertelement <8 x i16> [[TMP36]], i16 [[TMP37]], i32 6 +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = insertelement <8 x i16> [[TMP38]], i16 [[TMP39]], i32 7 +; SSE-NEXT: [[TMP41:%.*]] = shl <8 x i16> [[TMP24]], [[TMP40]] +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <8 x i16> undef, i16 [[TMP42]], i32 0 +; SSE-NEXT: 
[[TMP44:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <8 x i16> [[TMP43]], i16 [[TMP44]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <8 x i16> [[TMP45]], i16 [[TMP46]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <8 x i16> [[TMP47]], i16 [[TMP48]], i32 3 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <8 x i16> [[TMP49]], i16 [[TMP50]], i32 4 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP53:%.*]] = insertelement <8 x i16> [[TMP51]], i16 [[TMP52]], i32 5 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP55:%.*]] = insertelement <8 x i16> [[TMP53]], i16 [[TMP54]], i32 6 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP57:%.*]] = insertelement <8 x i16> [[TMP55]], i16 [[TMP56]], i32 7 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP59:%.*]] = insertelement <8 x i16> undef, i16 [[TMP58]], i32 0 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[TMP61:%.*]] = insertelement <8 x i16> [[TMP59]], i16 [[TMP60]], i32 1 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[TMP63:%.*]] = insertelement <8 x i16> [[TMP61]], i16 [[TMP62]], i32 2 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[TMP65:%.*]] = insertelement <8 x i16> [[TMP63]], i16 [[TMP64]], i32 3 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[TMP67:%.*]] = insertelement <8 x i16> [[TMP65]], i16 [[TMP66]], i32 4 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[TMP69:%.*]] = insertelement <8 x i16> [[TMP67]], i16 [[TMP68]], i32 5 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[TMP71:%.*]] = insertelement <8 x i16> [[TMP69]], i16 [[TMP70]], i32 6 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> [[TMP71]], i16 [[TMP72]], i32 7 +; SSE-NEXT: [[TMP74:%.*]] = shl <8 x i16> [[TMP57]], [[TMP73]] +; SSE-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> undef, i16 [[TMP75]], i32 0 +; SSE-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[TMP77]], i32 1 +; SSE-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[TMP79]], i32 2 +; SSE-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP80]], i16 [[TMP81]], i32 3 +; SSE-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[TMP83]], i32 4 +; SSE-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[TMP85]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[TMP87]], i32 6 +; SSE-NEXT: [[TMP89:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP90:%.*]] = 
insertelement <8 x i16> [[TMP88]], i16 [[TMP89]], i32 7 +; SSE-NEXT: [[TMP91:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> undef, i16 [[TMP91]], i32 0 +; SSE-NEXT: [[TMP93:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[TMP93]], i32 1 +; SSE-NEXT: [[TMP95:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[TMP95]], i32 2 +; SSE-NEXT: [[TMP97:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP96]], i16 [[TMP97]], i32 3 +; SSE-NEXT: [[TMP99:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[TMP99]], i32 4 +; SSE-NEXT: [[TMP101:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[TMP101]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[TMP103]], i32 6 +; SSE-NEXT: [[TMP105:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[TMP106:%.*]] = insertelement <8 x i16> [[TMP104]], i16 [[TMP105]], i32 7 +; SSE-NEXT: [[TMP107:%.*]] = shl <8 x i16> [[TMP90]], [[TMP106]] +; SSE-NEXT: [[TMP108:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP109:%.*]] = insertelement <8 x i16> undef, i16 [[TMP108]], i32 0 +; SSE-NEXT: [[TMP110:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP111:%.*]] = insertelement <8 x i16> [[TMP109]], i16 [[TMP110]], i32 1 +; SSE-NEXT: [[TMP112:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP113:%.*]] = insertelement <8 x i16> [[TMP111]], i16 [[TMP112]], i32 2 +; SSE-NEXT: [[TMP114:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP115:%.*]] = insertelement <8 x i16> [[TMP113]], i16 [[TMP114]], i32 3 +; SSE-NEXT: [[TMP116:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP117:%.*]] = insertelement <8 x i16> [[TMP115]], i16 [[TMP116]], i32 4 +; SSE-NEXT: [[TMP118:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP119:%.*]] = insertelement <8 x i16> [[TMP117]], i16 [[TMP118]], i32 5 +; SSE-NEXT: [[TMP120:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP121:%.*]] = insertelement <8 x i16> [[TMP119]], i16 [[TMP120]], i32 6 +; SSE-NEXT: [[TMP122:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP123:%.*]] = insertelement <8 x i16> [[TMP121]], i16 [[TMP122]], i32 7 +; SSE-NEXT: [[TMP124:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP125:%.*]] = insertelement <8 x i16> undef, i16 [[TMP124]], i32 0 +; SSE-NEXT: [[TMP126:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[TMP127:%.*]] = insertelement <8 x i16> [[TMP125]], i16 [[TMP126]], i32 1 +; SSE-NEXT: [[TMP128:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[TMP129:%.*]] = insertelement <8 x i16> [[TMP127]], i16 [[TMP128]], i32 2 +; SSE-NEXT: [[TMP130:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[TMP131:%.*]] = insertelement <8 x i16> [[TMP129]], i16 [[TMP130]], i32 3 +; SSE-NEXT: [[TMP132:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[TMP133:%.*]] = insertelement <8 x i16> [[TMP131]], i16 [[TMP132]], i32 4 +; SSE-NEXT: [[TMP134:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[TMP135:%.*]] = insertelement <8 x i16> 
[[TMP133]], i16 [[TMP134]], i32 5 +; SSE-NEXT: [[TMP136:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[TMP137:%.*]] = insertelement <8 x i16> [[TMP135]], i16 [[TMP136]], i32 6 +; SSE-NEXT: [[TMP138:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[TMP139:%.*]] = insertelement <8 x i16> [[TMP137]], i16 [[TMP138]], i32 7 +; SSE-NEXT: [[TMP140:%.*]] = shl <8 x i16> [[TMP123]], [[TMP139]] +; SSE-NEXT: store <8 x i16> [[TMP41]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP74]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP107]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP140]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v32i16( Index: test/Transforms/SLPVectorizer/X86/sitofp.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/sitofp.ll +++ test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -1,8 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ @@ -26,31 +23,26 @@ ; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 ; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double ; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; SSE-NEXT: ret void ; +; AVX-LABEL: @sitofp_2i64_2f64( +; AVX-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> +; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x 
double>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_2i64_2f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 ; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double ; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; AVX256NODQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_2i64_2f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> -; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_2i64_2f64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> -; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %cvt0 = sitofp i64 %ld0 to double @@ -70,12 +62,20 @@ ; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double ; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double ; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX-LABEL: @sitofp_4i64_4f64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX-NEXT: 
[[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_4i64_4f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -85,24 +85,12 @@ ; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double ; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double ; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_4i64_4f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX512-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_4i64_4f64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256DQ-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -136,16 +124,35 @@ ; SSE-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to double ; SSE-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to double ; SSE-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* 
getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i64_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; +; AVX256-LABEL: @sitofp_8i64_8f64( +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 +; AVX256-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_8i64_8f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -163,31 +170,17 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to double ; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to double ; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], 
[8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i64_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_8i64_8f64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; AVX256DQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double> -; AVX256DQ-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 -; AVX256DQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -221,8 +214,9 @@ 
; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 ; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[LD0]] to double ; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 @@ -244,10 +238,12 @@ ; SSE-NEXT: [[CVT1:%.*]] = sitofp i32 [[LD1]] to double ; SSE-NEXT: [[CVT2:%.*]] = sitofp i32 [[LD2]] to double ; SSE-NEXT: [[CVT3:%.*]] = sitofp i32 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i32_4f64( @@ -289,16 +285,26 @@ ; SSE-NEXT: [[CVT5:%.*]] = sitofp i32 [[LD5]] to double ; SSE-NEXT: [[CVT6:%.*]] = sitofp i32 [[LD6]] to double ; SSE-NEXT: [[CVT7:%.*]] = sitofp i32 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = 
insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i32_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_8i32_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 @@ -307,12 +313,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i32_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 @@ -347,8 +347,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 ; CHECK-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double ; CHECK-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = 
insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; CHECK-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 @@ -370,10 +371,12 @@ ; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double ; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to double ; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i16_4f64( @@ -415,16 +418,26 @@ ; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to double ; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to double ; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: 
[[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i16_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_8i16_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 @@ -433,12 +446,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i16_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 @@ -473,8 +480,9 @@ ; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 ; CHECK-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double ; CHECK-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; CHECK-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 @@ -496,10 +504,12 @@ ; SSE-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double ; SSE-NEXT: [[CVT2:%.*]] = sitofp i8 [[LD2]] to double ; SSE-NEXT: [[CVT3:%.*]] = sitofp i8 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double 
[[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i8_4f64( @@ -541,16 +551,26 @@ ; SSE-NEXT: [[CVT5:%.*]] = sitofp i8 [[LD5]] to double ; SSE-NEXT: [[CVT6:%.*]] = sitofp i8 [[LD6]] to double ; SSE-NEXT: [[CVT7:%.*]] = sitofp i8 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) 
to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i8_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_8i8_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 @@ -559,12 +579,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i8_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 @@ -626,39 +640,34 @@ ; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float ; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float ; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; SSE-NEXT: ret void ; +; AVX-LABEL: @sitofp_4i64_4f32( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_4i64_4f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: 
[[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float> +; AVX256NODQ-NEXT: [[TMP4:%.*]] = sitofp <2 x i64> [[TMP2]] to <2 x float> +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP9]], i32 2 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP11]], i32 3 +; AVX256NODQ-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_4i64_4f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> -; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_4i64_4f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> -; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -692,55 +701,47 @@ ; SSE-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to float ; SSE-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float ; SSE-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX-LABEL: @sitofp_8i64_8f32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> +; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_8i64_8f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to float -; AVX256NODQ-NEXT: 
[[CVT5:%.*]] = sitofp i64 [[LD5]] to float -; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float -; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; AVX256NODQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x float> +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <8 x float> undef, float [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP6]], float [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x float> [[TMP8]], float [[TMP9]], i32 2 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP10]], float [[TMP11]], i32 3 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP13]], i32 4 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP15]], i32 5 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP17]], i32 6 +; AVX256NODQ-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; AVX256NODQ-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP18]], float [[TMP19]], i32 7 +; AVX256NODQ-NEXT: store <8 x float> [[TMP20]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i64_8f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> -; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 -; AVX512-NEXT: ret void 
-; -; AVX256DQ-LABEL: @sitofp_8i64_8f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> -; AVX256DQ-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -849,6 +850,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_16i32_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_16i32_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 @@ -857,12 +864,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_16i32_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64 %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4 @@ -996,6 +997,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_16i16_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_16i16_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 @@ -1004,12 +1011,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> 
[[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_16i16_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2 @@ -1143,6 +1144,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_16i8_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_16i8_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 @@ -1151,12 +1158,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_16i8_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1 Index: test/Transforms/SLPVectorizer/X86/uitofp.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/uitofp.ll +++ test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -1,8 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ ; RUN: opt < %s -mtriple=x86_64-unknown 
-mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ @@ -83,6 +80,12 @@ ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @uitofp_8i64_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @uitofp_8i64_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 @@ -91,12 +94,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @uitofp_8i64_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -131,31 +128,26 @@ ; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 ; SSE-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double ; SSE-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; SSE-NEXT: ret void ; +; AVX-LABEL: @uitofp_2i32_2f64( +; AVX-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> +; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @uitofp_2i32_2f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr 
inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
-; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; AVX256NODQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_2i32_2f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
-; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @uitofp_2i32_2f64(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
-; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX256DQ-NEXT: ret void
-;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
%cvt0 = uitofp i32 %ld0 to double
@@ -175,10 +167,12 @@
; SSE-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
; SSE-NEXT: [[CVT2:%.*]] = uitofp i32 [[LD2]] to double
; SSE-NEXT: [[CVT3:%.*]] = uitofp i32 [[LD3]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_4i32_4f64(
@@ -220,16 +214,26 @@
; SSE-NEXT: [[CVT5:%.*]] = uitofp i32 [[LD5]] to double
; SSE-NEXT: [[CVT6:%.*]] = uitofp i32 [[LD6]] to double
; SSE-NEXT: [[CVT7:%.*]] = uitofp i32 [[LD7]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_8i32_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_8i32_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
@@ -238,12 +242,6 @@
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_8i32_8f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
-; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
@@ -278,8 +276,9 @@
; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
; CHECK-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
-; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; CHECK-NEXT: ret void
;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
@@ -301,10 +300,12 @@
; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to double
; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_4i16_4f64(
@@ -346,16 +347,26 @@
; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to double
; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to double
; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_8i16_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_8i16_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
@@ -364,12 +375,6 @@
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_8i16_8f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
-; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
@@ -404,31 +409,26 @@
; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; SSE-NEXT: ret void
;
+; AVX-LABEL: @uitofp_2i8_2f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
+; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX-NEXT: ret void
+;
; AVX256NODQ-LABEL: @uitofp_2i8_2f64(
; AVX256NODQ-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
; AVX256NODQ-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
-; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; AVX256NODQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_2i8_2f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
-; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @uitofp_2i8_2f64(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
-; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
-; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX256DQ-NEXT: ret void
-;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
%cvt0 = uitofp i8 %ld0 to double
%cvt1 = uitofp i8 %ld1 to double
@@ -448,10 +448,12 @@
; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
; SSE-NEXT: [[CVT2:%.*]] = uitofp i8 [[LD2]] to double
; SSE-NEXT: [[CVT3:%.*]] = uitofp i8 [[LD3]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_4i8_4f64(
@@ -493,16 +495,26 @@
; SSE-NEXT: [[CVT5:%.*]] = uitofp i8 [[LD5]] to double
; SSE-NEXT: [[CVT6:%.*]] = uitofp i8 [[LD6]] to double
; SSE-NEXT: [[CVT7:%.*]] = uitofp i8 [[LD7]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[CVT2]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CVT3]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[CVT4]], i32 0
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[CVT5]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[CVT6]], i32 0
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CVT7]], i32 1
+; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_8i8_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_8i8_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
@@ -511,12 +523,6 @@
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_8i8_8f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
-; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
@@ -578,39 +584,34 @@
; SSE-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
; SSE-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
; SSE-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CVT1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CVT2]], i32 2
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CVT3]], i32 3
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; SSE-NEXT: ret void
;
+; AVX-LABEL: @uitofp_4i64_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX-NEXT: ret void
+;
; AVX256NODQ-LABEL: @uitofp_4i64_4f32(
-; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
-; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; AVX256NODQ-NEXT: [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX256NODQ-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x float>
+; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0
+; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 1
+; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP9]], i32 2
+; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP11]], i32 3
+; AVX256NODQ-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_4i64_4f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
-; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @uitofp_4i64_4f32(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
-; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; AVX256DQ-NEXT: ret void
-;
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
%ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
@@ -644,55 +645,47 @@
; SSE-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
; SSE-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
; SSE-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
-; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[CVT1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[CVT2]], i32 2
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[CVT3]], i32 3
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x float> undef, float [[CVT4]], i32 0
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[CVT5]], i32 1
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT6]], i32 2
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT7]], i32 3
+; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
+; AVX-LABEL: @uitofp_8i64_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT: ret void
+;
; AVX256NODQ-LABEL: @uitofp_8i64_8f32(
-; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
-; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; AVX256NODQ-NEXT: [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
-; AVX256NODQ-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
-; AVX256NODQ-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
-; AVX256NODQ-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
-; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256NODQ-NEXT: [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX256NODQ-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x float>
+; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <8 x float> undef, float [[TMP5]], i32 0
+; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP6]], float [[TMP7]], i32 1
+; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x float> [[TMP8]], float [[TMP9]], i32 2
+; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP10]], float [[TMP11]], i32 3
+; AVX256NODQ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP13]], i32 4
+; AVX256NODQ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP15]], i32 5
+; AVX256NODQ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP17]], i32 6
+; AVX256NODQ-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; AVX256NODQ-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP18]], float [[TMP19]], i32 7
+; AVX256NODQ-NEXT: store <8 x float> [[TMP20]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_8i64_8f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
-; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @uitofp_8i64_8f32(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
-; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
-; AVX256DQ-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256DQ-NEXT: ret void
-;
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
%ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
@@ -801,6 +794,12 @@
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_16i32_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_16i32_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
@@ -809,12 +808,6 @@
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_16i32_16f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
-; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4
@@ -948,6 +941,12 @@
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_16i16_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_16i16_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
@@ -956,12 +955,6 @@
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_16i16_16f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
-; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2
@@ -1095,6 +1088,12 @@
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
+; AVX512-LABEL: @uitofp_16i8_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
; AVX256-LABEL: @uitofp_16i8_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
@@ -1103,12 +1102,6 @@
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @uitofp_16i8_16f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
-; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
-; AVX512-NEXT: ret void
;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1