Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -541,6 +541,7 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -600,6 +601,9 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable();

+  /// Reduce the cost of the tree to make it partially vectorizable, if possible.
+  bool reduceTreeCost(int Delta);
+
   OptimizationRemarkEmitter *getORE() { return ORE; }

 private:
@@ -700,6 +704,9 @@
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector UserTreeIndices;
+
+    /// Cost of the tree entry.
+    int Cost;
   };

   /// Create a new VectorizableTree entry.
@@ -742,6 +749,8 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, int> ScalarToTreeEntry;

+  SmallVector RemovedOperations;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;

@@ -2413,6 +2422,85 @@
   }
 }

+bool BoUpSLP::reduceTreeCost(int Delta) {
+  SmallVector Tree;
+  bool Reduced = false;
+  unsigned NumVectorized = 0;
+  int Sum = 0;
+
+  // Walk the tree entries backward, towards the top: gathering nodes come
+  // first, followed by the actual vectorizable operation.
+  for (unsigned I = VectorizableTree.size(); I--;) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (!Entry->NeedToGather) {
+      NumVectorized++;
+      Sum += Entry->Cost;
+      Tree.push_back(Sum);
+      Sum = 0;
+    } else {
+      Sum += Entry->Cost;
+    }
+  }
+
+  // Estimating where to stop vectorization.
+  Sum = 0;
+  unsigned StopAt = 0;
+  for (unsigned I = 0, E = Tree.size(); I < E; I++) {
+    Sum += Tree[I];
+    if (Sum >= Delta) {
+      StopAt = I;
+      break;
+    }
+  }
+
+  // Canceling unprofitable elements.
+  if (StopAt > 0 && StopAt < (Tree.size() - 1)) {
+    NumVectorized = 0;
+    for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) {
+      TreeEntry *Entry = &VectorizableTree[I];
+      if (!Entry->NeedToGather) {
+        NumVectorized++;
+        if (NumVectorized > StopAt) {
+          NumVectorized++;
+          InstructionsState S = getSameOpcode(Entry->Scalars);
+          auto *VL0 = cast<Instruction>(S.OpValue);
+          // Avoid any PHI nodes or trees with alternative operations.
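+          // Cancelling the vectorization decision for such entries is not
+          // supported here, so give up on reducing the tree.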
+          if (VL0->getOpcode() == Instruction::PHI || S.AltOp)
+            return false;
+        } else
+          break;
+      }
+    }
+    LLVM_DEBUG(dbgs() << "SLP: reduced the tree cost by " << Delta
+                      << " to make it partially vectorizable.\n");
+    NumVectorized = 0;
+    Reduced = true;
+    for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) {
+      TreeEntry *Entry = &VectorizableTree[I];
+      if (!Entry->NeedToGather) {
+        NumVectorized++;
+        if (NumVectorized > StopAt) {
+          Entry->NeedToGather = true;
+          for (Value *V : Entry->Scalars) {
+            RemovedOperations.push_back(I);
+            ScalarToTreeEntry.erase(V);
+            ExternalUses.erase(std::remove_if(ExternalUses.begin(),
+                                              ExternalUses.end(),
+                                              [&](ExternalUser &EU) {
+                                                return EU.Scalar == V;
+                                              }),
+                               ExternalUses.end());
+          }
+        } else
+          break;
+      }
+    }
+  }
+
+  return Reduced;
+}
+
 bool BoUpSLP::isFullyVectorizableTinyTree() {
   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                     << VectorizableTree.size() << " is fully vectorizable .\n");
@@ -2468,7 +2556,8 @@
   SmallPtrSet<Instruction *, 4> LiveValues;
   Instruction *PrevInst = nullptr;

-  for (const auto &N : VectorizableTree) {
+  for (auto &N : VectorizableTree) {
+    int EntryCost = 0;
     Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
     if (!Inst)
       continue;
@@ -2508,15 +2597,19 @@
            !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
           &*PrevInstIt != PrevInst) {
         SmallVector<Type *, 4> V;
+        int ElementCost;
         for (auto *II : LiveValues)
           V.push_back(VectorType::get(II->getType(), BundleWidth));
-        Cost += TTI->getCostOfKeepingLiveOverCall(V);
+        ElementCost = TTI->getCostOfKeepingLiveOverCall(V);
+        Cost += ElementCost;
+        EntryCost += ElementCost;
       }
       ++PrevInstIt;
     }

     PrevInst = Inst;
+    N.Cost += EntryCost;
   }

   return Cost;
@@ -2552,6 +2645,7 @@
       continue;

     int C = getEntryCost(&TE);
+    TE.Cost = C;
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                       << " for bundle that starts with " << *TE.Scalars[0]
                       << ".\n");
@@ -3574,7 +3668,36 @@
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove all ScheduleData from the nodes for which we have changed the
+    // vectorization decision.
+    if (!RemovedOperations.empty()) {
+      bool Removed = false;
+      for (int I : RemovedOperations) {
+        TreeEntry *Entry = &VectorizableTree[I];
+        ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]);
+        if (SD && SD->isPartOfBundle()) {
+          if (!Removed) {
+            Removed = true;
+            BS->resetSchedule();
+          }
+          BS->cancelScheduling(Entry->Scalars, SD->OpValue);
+        }
+      }
+      if (Removed) {
+        BS->resetSchedule();
+        BS->initialFillReadyList(BS->ReadyInsts);
+        for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+             I = I->getNextNode()) {
+          if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end())
+            continue;
+          BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+            SD->clearDependencies();
+          });
+        }
+      }
+    }
+    scheduleBlock(BS);
   }

   Builder.SetInsertPoint(&F->getEntryBlock().front());
@@ -4729,6 +4852,18 @@
       // Move to the next bundle.
       i += VF - 1;
       Changed = true;
+    } else {
+      // Try to reduce the tree to make it partially vectorizable.
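+      // Delta is how much cost has to be trimmed from the tree before the
+      // remaining part drops below the SLP cost threshold.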
+ int Delta = Cost - SLPCostThreshold + 1; + bool Reduced = false; + if (R.getTreeSize() > 1) { + Reduced = R.reduceTreeCost(Delta); + } + if (Reduced) { + R.vectorizeTree(); + i += VF - 1; + Changed = true; + } } } Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -39,19 +39,27 @@ ; CHECK-LABEL: @store_chain_v2i64( ; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1 ; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1 -; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1 ; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8 ; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8 ; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8 ; CHECK-NEXT: [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8 -; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[V0_0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[V1_0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[V0_1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[V1_1]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = sub <2 x i64> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i64> [[TMP7]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 8 ; CHECK-NEXT: ret void ; %a.0 = getelementptr i64, i64* %a, i64 0 Index: test/Transforms/SLPVectorizer/NVPTX/v2f16.ll =================================================================== --- test/Transforms/SLPVectorizer/NVPTX/v2f16.ll +++ test/Transforms/SLPVectorizer/NVPTX/v2f16.ll @@ -31,18 +31,16 @@ ; NOVECTOR-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], 1 ; NOVECTOR-NEXT: [[TMP10:%.*]] = bitcast i8* [[ARG1:%.*]] to half* ; NOVECTOR-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]] -; NOVECTOR-NEXT: [[TMP12:%.*]] = load half, half* [[TMP11]], align 8 -; NOVECTOR-NEXT: [[TMP13:%.*]] = fmul fast half [[TMP12]], 0xH5380 -; NOVECTOR-NEXT: [[TMP14:%.*]] = fadd fast half [[TMP13]], 
0xH57F0 ; NOVECTOR-NEXT: [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half* ; NOVECTOR-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]] -; NOVECTOR-NEXT: store half [[TMP14]], half* [[TMP16]], align 8 ; NOVECTOR-NEXT: [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]] -; NOVECTOR-NEXT: [[TMP18:%.*]] = load half, half* [[TMP17]], align 2 -; NOVECTOR-NEXT: [[TMP19:%.*]] = fmul fast half [[TMP18]], 0xH5380 -; NOVECTOR-NEXT: [[TMP20:%.*]] = fadd fast half [[TMP19]], 0xH57F0 +; NOVECTOR-NEXT: [[TMP1:%.*]] = bitcast half* [[TMP11]] to <2 x half>* +; NOVECTOR-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 8 +; NOVECTOR-NEXT: [[TMP3:%.*]] = fmul fast <2 x half> , [[TMP2]] +; NOVECTOR-NEXT: [[TMP4:%.*]] = fadd fast <2 x half> , [[TMP3]] ; NOVECTOR-NEXT: [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]] -; NOVECTOR-NEXT: store half [[TMP20]], half* [[TMP21]], align 2 +; NOVECTOR-NEXT: [[TMP5:%.*]] = bitcast half* [[TMP16]] to <2 x half>* +; NOVECTOR-NEXT: store <2 x half> [[TMP4]], <2 x half>* [[TMP5]], align 8 ; NOVECTOR-NEXT: ret void ; %tmp = shl nuw nsw i32 %arg2, 6 Index: test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll +++ test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll @@ -27,104 +27,53 @@ define void @add_v8i64() { ; SSE-LABEL: @add_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 
@llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SSE-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SSE-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, 
i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; -; AVX1-LABEL: @add_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> 
[[TMP8]]) -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @add_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 -; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: ret void +; AVX-LABEL: @add_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) +; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @add_v8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 @@ -132,17 +81,6 @@ ; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) ; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 ; AVX512-NEXT: ret void -; -; AVX256BW-LABEL: @add_v8i64( -; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast 
(i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: ret void ; %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 Index: test/Transforms/SLPVectorizer/X86/arith-add-usat.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-add-usat.ll +++ test/Transforms/SLPVectorizer/X86/arith-add-usat.ll @@ -27,38 +27,22 @@ define void @add_v8i64() { ; SSE-LABEL: @add_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 
@llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SSE-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SSE-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-mul.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -22,120 +22,53 @@ define void @mul_v8i64() { ; SSE-LABEL: @mul_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: 
[[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 
0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = mul <2 x i64> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] -; SLM-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] -; SLM-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] -; SLM-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] -; SLM-NEXT: [[R4:%.*]] = 
mul i64 [[A4]], [[B4]] -; SLM-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] -; SLM-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] -; SLM-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = mul <2 x i64> [[TMP1]], [[TMP5]] +; SLM-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[TMP2]], [[TMP6]] +; SLM-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[TMP3]], [[TMP7]] +; SLM-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[TMP4]], [[TMP8]] +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; -; AVX1-LABEL: @mul_v8i64( -; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; 
AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] -; AVX1-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] -; AVX1-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] -; AVX1-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] -; AVX1-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] -; AVX1-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] -; AVX1-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] -; AVX1-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] -; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @mul_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]] -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 -; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* 
bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: ret void +; AVX-LABEL: @mul_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @mul_v8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 Index: test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll +++ test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll @@ -27,104 +27,53 @@ define void @sub_v8i64() { ; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: 
[[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SSE-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SSE-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: 
[[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 
x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; -; AVX1-LABEL: @sub_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x 
i64> [[TMP4]], <2 x i64> [[TMP8]]) -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @sub_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 -; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: ret void +; AVX-LABEL: @sub_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) +; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @sub_v8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 @@ -132,17 +81,6 @@ ; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) ; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 ; AVX512-NEXT: ret void -; -; AVX256BW-LABEL: @sub_v8i64( -; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x 
i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: ret void ; %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 Index: test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll +++ test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll @@ -27,38 +27,22 @@ define void @sub_v8i64() { ; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: 
[[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SSE-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SSE-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( Index: test/Transforms/SLPVectorizer/X86/bswap.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/bswap.ll +++ test/Transforms/SLPVectorizer/X86/bswap.ll @@ -17,20 +17,11 @@ declare i16 @llvm.bswap.i16(i16) define void @bswap_2i64() #0 { -; SSE-LABEL: @bswap_2i64( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x 
i64]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[BSWAP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD0]]) -; SSE-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]]) -; SSE-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: ret void -; -; AVX-LABEL: @bswap_2i64( -; AVX-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP1]]) -; AVX-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 -; AVX-NEXT: ret void +; CHECK-LABEL: @bswap_2i64( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP1]]) +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 @@ -43,18 +34,12 @@ define void @bswap_4i64() #0 { ; SSE-LABEL: @bswap_4i64( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; SSE-NEXT: [[BSWAP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD0]]) -; SSE-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]]) -; SSE-NEXT: [[BSWAP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD2]]) -; SSE-NEXT: [[BSWAP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD3]]) -; SSE-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; SSE-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; SSE-NEXT: store i64 [[BSWAP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; SSE-NEXT: store i64 [[BSWAP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP2]]) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @bswap_4i64( Index: test/Transforms/SLPVectorizer/X86/cast.ll 
=================================================================== --- test/Transforms/SLPVectorizer/X86/cast.ll +++ test/Transforms/SLPVectorizer/X86/cast.ll @@ -78,18 +78,11 @@ define i64 @test_sext_4i16_to_4i64(i64* noalias nocapture %A, i16* noalias nocapture %B) { ; CHECK-LABEL: @test_sext_4i16_to_4i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <4 x i64>* +; CHECK-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 4 ; CHECK-NEXT: ret i64 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/ctlz.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/ctlz.ll +++ test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -26,12 +26,9 @@ define void @ctlz_2i64() #0 { ; CHECK-LABEL: @ctlz_2i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false) -; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 false) +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -44,20 +41,20 @@ } define void @ctlz_4i64() #0 { -; CHECK-LABEL: @ctlz_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], 
i1 false) -; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false) -; CHECK-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @ctlz_4i64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 false) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP2]], i1 false) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ctlz_4i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> [[TMP1]], i1 false) +; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -76,18 +73,9 @@ define void @ctlz_4i32() #0 { ; CHECK-LABEL: @ctlz_4i32( -; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; CHECK-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; CHECK-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; CHECK-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; CHECK-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; CHECK-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> 
@llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -107,64 +95,19 @@ define void @ctlz_8i32() #0 { ; SSE-LABEL: @ctlz_8i32( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; SSE-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) -; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) -; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) -; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) -; SSE-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP2]], i1 false) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: 
ret void ; -; AVX1-LABEL: @ctlz_8i32( -; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; AVX1-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) -; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) -; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) -; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) -; AVX1-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @ctlz_8i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 -; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) -; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 -; AVX2-NEXT: ret void +; AVX-LABEL: @ctlz_8i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 +; AVX-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 @@ -467,12 +410,9 @@ define void @ctlz_undef_2i64() #0 { ; CHECK-LABEL: 
@ctlz_undef_2i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) -; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 true) +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -485,20 +425,20 @@ } define void @ctlz_undef_4i64() #0 { -; CHECK-LABEL: @ctlz_undef_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) -; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true) -; CHECK-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true) -; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @ctlz_undef_4i64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 true) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP2]], i1 true) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ctlz_undef_4i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> [[TMP1]], i1 true) +; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; 
AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -517,18 +457,9 @@ define void @ctlz_undef_4i32() #0 { ; CHECK-LABEL: @ctlz_undef_4i32( -; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true) -; CHECK-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true) -; CHECK-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true) -; CHECK-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true) -; CHECK-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; CHECK-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true) +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -548,64 +479,19 @@ define void @ctlz_undef_8i32() #0 { ; SSE-LABEL: @ctlz_undef_8i32( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true) -; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true) -; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true) -; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true) -; SSE-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 true) -; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true) -; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 
[[LD6]], i1 true) -; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true) -; SSE-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP2]], i1 true) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @ctlz_undef_8i32( -; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true) -; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true) -; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true) -; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true) -; AVX1-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 true) -; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true) -; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true) -; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true) -; AVX1-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* 
@dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @ctlz_undef_8i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 -; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 true) -; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 -; AVX2-NEXT: ret void +; AVX-LABEL: @ctlz_undef_8i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 true) +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 +; AVX-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 Index: test/Transforms/SLPVectorizer/X86/ctpop.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/ctpop.ll +++ test/Transforms/SLPVectorizer/X86/ctpop.ll @@ -22,12 +22,9 @@ define void @ctpop_2i64() #0 { ; CHECK-LABEL: @ctpop_2i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]]) -; CHECK-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) -; CHECK-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]]) +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -41,40 +38,19 @@ define void @ctpop_4i64() #0 { ; SSE-LABEL: @ctpop_4i64( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; 
SSE-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]]) -; SSE-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) -; SSE-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]]) -; SSE-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]]) -; SSE-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; SSE-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; SSE-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; SSE-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP2]]) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @ctpop_4i64( -; AVX1-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; AVX1-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; AVX1-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; AVX1-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; AVX1-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]]) -; AVX1-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) -; AVX1-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]]) -; AVX1-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]]) -; AVX1-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; AVX1-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; AVX1-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; AVX1-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @ctpop_4i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[TMP1]]) -; AVX2-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 -; AVX2-NEXT: ret void +; AVX-LABEL: @ctpop_4i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[TMP1]]) +; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds 
([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -92,41 +68,11 @@ } define void @ctpop_4i32() #0 { -; SSE2-LABEL: @ctpop_4i32( -; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 -; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]]) -; SSE2-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 -; SSE2-NEXT: ret void -; -; SSE42-LABEL: @ctpop_4i32( -; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE42-NEXT: ret void -; -; AVX-LABEL: @ctpop_4i32( -; AVX-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; AVX-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; AVX-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; AVX-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; AVX-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; AVX-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX-NEXT: ret void +; CHECK-LABEL: @ctpop_4i32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]]) +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 +; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), 
align 4 @@ -144,74 +90,20 @@ } define void @ctpop_8i32() #0 { -; SSE2-LABEL: @ctpop_8i32( -; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE2-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]]) -; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP2]]) -; SSE2-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 -; SSE2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE2-NEXT: ret void -; -; SSE42-LABEL: @ctpop_8i32( -; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; SSE42-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; SSE42-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; SSE42-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; SSE42-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; SSE42-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]]) -; SSE42-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]]) -; SSE42-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]]) -; SSE42-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]]) -; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE42-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE42-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE42-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE42-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 -; SSE42-NEXT: ret void -; -; AVX1-LABEL: @ctpop_8i32( -; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr 
inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; AVX1-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; AVX1-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; AVX1-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; AVX1-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; AVX1-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]]) -; AVX1-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]]) -; AVX1-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]]) -; AVX1-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]]) -; AVX1-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 -; AVX1-NEXT: ret void +; SSE-LABEL: @ctpop_8i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: ret void ; -; AVX2-LABEL: @ctpop_8i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 -; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> [[TMP1]]) -; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 -; AVX2-NEXT: ret void +; AVX-LABEL: @ctpop_8i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = call <8 x 
i32> @llvm.ctpop.v8i32(<8 x i32> [[TMP1]]) +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 +; AVX-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 Index: test/Transforms/SLPVectorizer/X86/cttz.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cttz.ll +++ test/Transforms/SLPVectorizer/X86/cttz.ll @@ -26,12 +26,9 @@ define void @cttz_2i64() #0 { ; CHECK-LABEL: @cttz_2i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false) -; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[TMP1]], i1 false) +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -44,20 +41,20 @@ } define void @cttz_4i64() #0 { -; CHECK-LABEL: @cttz_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false) -; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false) -; CHECK-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false) -; CHECK-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @cttz_4i64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[TMP1]], i1 false) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[TMP2]], i1 
false) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @cttz_4i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> [[TMP1]], i1 false) +; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -76,18 +73,9 @@ define void @cttz_4i32() #0 { ; CHECK-LABEL: @cttz_4i32( -; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) -; CHECK-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) -; CHECK-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) -; CHECK-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) -; CHECK-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; CHECK-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -107,64 +95,19 @@ define void @cttz_8i32() #0 { ; SSE-LABEL: @cttz_8i32( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i32, 
i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; SSE-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) -; SSE-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) -; SSE-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) -; SSE-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) -; SSE-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 false) -; SSE-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false) -; SSE-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false) -; SSE-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false) -; SSE-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 false) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @cttz_8i32( -; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) -; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) -; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], 
i1 false) -; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) -; AVX1-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 false) -; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false) -; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false) -; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false) -; AVX1-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @cttz_8i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 -; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) -; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 -; AVX2-NEXT: ret void +; AVX-LABEL: @cttz_8i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 +; AVX-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 @@ -467,12 +410,9 @@ define void @cttz_undef_2i64() #0 { ; CHECK-LABEL: @cttz_undef_2i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) -; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[TMP1]], i1 true) +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 ; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 @@ -485,20 +425,20 @@ } define void @cttz_undef_4i64() #0 { -; CHECK-LABEL: @cttz_undef_4i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr 
inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) -; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true) -; CHECK-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true) -; CHECK-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true) -; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; CHECK-NEXT: store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; CHECK-NEXT: store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @cttz_undef_4i64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[TMP1]], i1 true) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[TMP2]], i1 true) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @cttz_undef_4i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> [[TMP1]], i1 true) +; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 @@ -517,18 +457,9 @@ define void @cttz_undef_4i32() #0 { ; CHECK-LABEL: @cttz_undef_4i32( -; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true) -; CHECK-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true) -; CHECK-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true) -; CHECK-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true) -; CHECK-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; CHECK-NEXT: 
store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true) +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -548,64 +479,19 @@ define void @cttz_undef_8i32() #0 { ; SSE-LABEL: @cttz_undef_8i32( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; SSE-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true) -; SSE-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true) -; SSE-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true) -; SSE-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true) -; SSE-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 true) -; SSE-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 true) -; SSE-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 true) -; SSE-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 true) -; SSE-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* 
@src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 true) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @cttz_undef_8i32( -; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true) -; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true) -; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true) -; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true) -; AVX1-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 true) -; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 true) -; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 true) -; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 true) -; AVX1-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @cttz_undef_8i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 -; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 true) -; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 -; AVX2-NEXT: ret void +; AVX-LABEL: @cttz_undef_8i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), 
align 2 +; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 true) +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 +; AVX-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 Index: test/Transforms/SLPVectorizer/X86/fma.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fma.ll +++ test/Transforms/SLPVectorizer/X86/fma.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256 @@ -26,16 +25,11 @@ define void @fma_2f64() #0 { ; NO-FMA-LABEL: @fma_2f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]]) +; NO-FMA-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_2f64( @@ -61,26 +55,11 @@ define void @fma_4f64() #0 { ; NO-FMA-LABEL: @fma_4f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, 
i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 8 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 8 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 8 +; NO-FMA-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x double> [[TMP3]]) +; NO-FMA-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_4f64( @@ -116,46 +95,16 @@ define void @fma_8f64() #0 { ; NO-FMA-LABEL: @fma_8f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, 
i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call double @llvm.fma.f64(double [[A4]], double [[B4]], double [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call double @llvm.fma.f64(double [[A5]], double [[B5]], double [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call double @llvm.fma.f64(double [[A6]], double [[B6]], double [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call double @llvm.fma.f64(double [[A7]], double [[B7]], double [[C7]]) -; NO-FMA-NEXT: store double [[FMA0]], 
double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store double [[FMA4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store double [[FMA5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store double [[FMA6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store double [[FMA7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]]) +; NO-FMA-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]]) +; NO-FMA-NEXT: store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; NO-FMA-NEXT: store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA256-LABEL: @fma_8f64( @@ -224,26 +173,11 @@ define void @fma_4f32() #0 { ; NO-FMA-LABEL: @fma_4f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: 
[[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]]) +; NO-FMA-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_4f32( @@ -279,46 +213,11 @@ define void @fma_8f32() #0 { ; NO-FMA-LABEL: @fma_8f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 
x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, 
i64 6), align 4 -; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP3]]) +; NO-FMA-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_8f32( @@ -374,86 +273,16 @@ define void @fma_16f32() #0 { ; NO-FMA-LABEL: @fma_16f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[A8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[A9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[A10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[A11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[A12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[A13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[A14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[A15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = 
load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[B9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[B10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[B11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[B12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[B13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[B14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[B15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[C9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[C10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[C11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[C12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[C13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[C14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 14), align 
4 -; NO-FMA-NEXT: [[C15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]]) -; NO-FMA-NEXT: [[FMA8:%.*]] = call float @llvm.fma.f32(float [[A8]], float [[B8]], float [[C8]]) -; NO-FMA-NEXT: [[FMA9:%.*]] = call float @llvm.fma.f32(float [[A9]], float [[B9]], float [[C9]]) -; NO-FMA-NEXT: [[FMA10:%.*]] = call float @llvm.fma.f32(float [[A10]], float [[B10]], float [[C10]]) -; NO-FMA-NEXT: [[FMA11:%.*]] = call float @llvm.fma.f32(float [[A11]], float [[B11]], float [[C11]]) -; NO-FMA-NEXT: [[FMA12:%.*]] = call float @llvm.fma.f32(float [[A12]], float [[B12]], float [[C12]]) -; NO-FMA-NEXT: [[FMA13:%.*]] = call float @llvm.fma.f32(float [[A13]], float [[B13]], float [[C13]]) -; NO-FMA-NEXT: [[FMA14:%.*]] = call float @llvm.fma.f32(float [[A14]], float [[B14]], float [[C14]]) -; NO-FMA-NEXT: [[FMA15:%.*]] = call float @llvm.fma.f32(float [[A15]], float [[B15]], float [[C15]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: store float [[FMA8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: store float [[FMA9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: store float [[FMA10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: store float [[FMA11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: store float [[FMA12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: store float [[FMA13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: store 
float [[FMA14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: store float [[FMA15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]]) +; NO-FMA-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]]) +; NO-FMA-NEXT: store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; NO-FMA-NEXT: store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA256-LABEL: @fma_16f32( Index: test/Transforms/SLPVectorizer/X86/fptosi.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fptosi.ll +++ test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -21,73 +21,34 @@ define void @fptosi_8f64_8i64() #0 { ; SSE-LABEL: @fptosi_8f64_8i64( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; SSE-NEXT: store i64 
[[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = fptosi <2 x double> [[TMP1]] to <2 x i64> +; SSE-NEXT: [[TMP6:%.*]] = fptosi <2 x double> [[TMP2]] to <2 x i64> +; SSE-NEXT: [[TMP7:%.*]] = fptosi <2 x double> [[TMP3]] to <2 x i64> +; SSE-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP4]] to <2 x i64> +; SSE-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptosi_8f64_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), 
align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: ret void +; AVX256-LABEL: @fptosi_8f64_8i64( +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX256-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64> +; AVX256-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64> +; AVX256-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fptosi_8f64_8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64> ; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8 ; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @fptosi_8f64_8i64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 -; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256DQ-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %a1 
= load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 @@ -160,38 +121,11 @@ } define void @fptosi_8f64_8i16() #0 { -; SSE-LABEL: @fptosi_8f64_8i16( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i16 -; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i16 -; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i16 -; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i16 -; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i16 -; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i16 -; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i16 -; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i16 -; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 -; SSE-NEXT: ret void -; -; AVX-LABEL: @fptosi_8f64_8i16( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i16> -; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 -; AVX-NEXT: ret void +; CHECK-LABEL: @fptosi_8f64_8i16( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i16> +; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 +; CHECK-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, 
i64 1), align 8 @@ -281,73 +215,34 @@ define void @fptosi_8f32_8i64() #0 { ; SSE-LABEL: @fptosi_8f32_8i64( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([16 x float]* @src32 to <2 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i64> +; SSE-NEXT: [[TMP6:%.*]] = fptosi <2 x float> [[TMP2]] to <2 x i64> +; SSE-NEXT: [[TMP7:%.*]] = fptosi <2 x float> [[TMP3]] to <2 x i64> +; SSE-NEXT: [[TMP8:%.*]] = fptosi <2 x float> [[TMP4]] to <2 x i64> +; SSE-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x 
i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptosi_8f32_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: ret void +; AVX256-LABEL: @fptosi_8f32_8i64( +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64> +; AVX256-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64> +; 
AVX256-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fptosi_8f32_8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 ; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i64> ; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8 ; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @fptosi_8f32_8i64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 -; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256DQ-NEXT: ret void ; %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 Index: test/Transforms/SLPVectorizer/X86/fptoui.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fptoui.ll +++ test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -21,73 +21,34 @@ define void @fptoui_8f64_8i64() #0 { ; SSE-LABEL: @fptoui_8f64_8i64( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; 
SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i64> +; SSE-NEXT: [[TMP6:%.*]] = fptoui <2 x double> [[TMP2]] to <2 x i64> +; SSE-NEXT: [[TMP7:%.*]] = fptoui <2 x double> [[TMP3]] to <2 x i64> +; SSE-NEXT: [[TMP8:%.*]] = fptoui <2 x double> [[TMP4]] to <2 x i64> +; SSE-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptoui_8f64_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i64 -; AVX256NODQ-NEXT: 
[[CVT1:%.*]] = fptoui double [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: ret void +; AVX256-LABEL: @fptoui_8f64_8i64( +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX256-NEXT: [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64> +; AVX256-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i64> +; AVX256-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f64_8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64> ; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8 ; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @fptoui_8f64_8i64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 -; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256DQ-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, 
i64 1), align 8 @@ -118,70 +79,19 @@ define void @fptoui_8f64_8i32() #0 { ; SSE-LABEL: @fptoui_8f64_8i32( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32 -; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32 -; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i32 -; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32 -; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32 -; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 -; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 -; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptoui_8f64_8i32( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 
1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i32 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @fptoui_8f64_8i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32> -; AVX512-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @fptoui_8f64_8i32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32> -; AVX256DQ-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 -; AVX256DQ-NEXT: ret void +; AVX-LABEL: @fptoui_8f64_8i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32> +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 +; AVX-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 @@ -211,71 +121,11 @@ } define void @fptoui_8f64_8i16() #0 { -; SSE-LABEL: @fptoui_8f64_8i16( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16 -; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16 -; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16 -; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16 -; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i16 -; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16 -; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16 -; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16 -; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 -; SSE-NEXT: ret void -; -; AVX256NODQ-LABEL: @fptoui_8f64_8i16( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load 
double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i16 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16 -; AVX256NODQ-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; AVX256NODQ-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 -; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @fptoui_8f64_8i16( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16> -; AVX512-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @fptoui_8f64_8i16( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16> -; AVX256DQ-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 -; AVX256DQ-NEXT: ret void +; CHECK-LABEL: @fptoui_8f64_8i16( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16> +; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 +; CHECK-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 @@ -365,73 +215,34 @@ define void @fptoui_8f32_8i64() #0 { ; SSE-LABEL: @fptoui_8f32_8i64( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; 
SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([16 x float]* @src32 to <2 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i64> +; SSE-NEXT: [[TMP6:%.*]] = fptoui <2 x float> [[TMP2]] to <2 x i64> +; SSE-NEXT: [[TMP7:%.*]] = fptoui <2 x float> [[TMP3]] to <2 x i64> +; SSE-NEXT: [[TMP8:%.*]] = fptoui <2 x float> [[TMP4]] to <2 x i64> +; SSE-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptoui_8f32_8i64( -; 
AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: ret void +; AVX256-LABEL: @fptoui_8f32_8i64( +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64> +; AVX256-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i64> +; AVX256-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f32_8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 ; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] 
to <8 x i64> ; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8 ; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @fptoui_8f32_8i64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 -; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256DQ-NEXT: ret void ; %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 @@ -462,70 +273,19 @@ define void @fptoui_8f32_8i32() #0 { ; SSE-LABEL: @fptoui_8f32_8i32( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32 -; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32 -; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32 -; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32 -; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32 -; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 -; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 -; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptoui_8f32_8i32( -; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @fptoui_8f32_8i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* 
@src32 to <8 x float>*), align 4 -; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32> -; AVX512-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @fptoui_8f32_8i32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32> -; AVX256DQ-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 -; AVX256DQ-NEXT: ret void +; AVX-LABEL: @fptoui_8f32_8i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32> +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 +; AVX-NEXT: ret void ; %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 @@ -555,38 +315,11 @@ } define void @fptoui_8f32_8i16() #0 { -; SSE-LABEL: @fptoui_8f32_8i16( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i16 -; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i16 -; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i16 -; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i16 -; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i16 -; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i16 -; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i16 -; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i16 -; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2 -; SSE-NEXT: 
store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2 -; SSE-NEXT: ret void -; -; AVX-LABEL: @fptoui_8f32_8i16( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i16> -; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 -; AVX-NEXT: ret void +; CHECK-LABEL: @fptoui_8f32_8i16( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i16> +; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2 +; CHECK-NEXT: ret void ; %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 Index: test/Transforms/SLPVectorizer/X86/fround.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fround.ll +++ test/Transforms/SLPVectorizer/X86/fround.ll @@ -26,26 +26,11 @@ declare float @llvm.trunc.f32(float %p) define void @ceil_2f64() #0 { -; SSE2-LABEL: @ceil_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @ceil_2f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: ret void -; -; AVX-LABEL: @ceil_2f64( -; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) -; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; AVX-NEXT: ret void +; CHECK-LABEL: @ceil_2f64( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; CHECK-NEXT: ret void ; %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 @@ -57,29 +42,14 @@ } define void @ceil_4f64() #0 { -; SSE2-LABEL: @ceil_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr 
inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @ceil_4f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: ret void +; SSE-LABEL: @ceil_4f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: ret void ; ; AVX-LABEL: @ceil_4f64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 @@ -103,47 +73,20 @@ } define void @ceil_8f64() #0 { -; SSE2-LABEL: @ceil_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; 
SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]]) -; SSE2-NEXT: [[CEIL4:%.*]] = call double @llvm.ceil.f64(double [[LD4]]) -; SSE2-NEXT: [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[LD5]]) -; SSE2-NEXT: [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[LD6]]) -; SSE2-NEXT: [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[LD7]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[CEIL5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[CEIL6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @ceil_8f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds 
([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: ret void +; SSE-LABEL: @ceil_8f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]]) +; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]]) +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP4]]) +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: ret void ; ; AVX1-LABEL: @ceil_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 @@ -197,26 +140,11 @@ } define void @floor_2f64() #0 { -; SSE2-LABEL: @floor_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @floor_2f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: ret void -; -; AVX-LABEL: @floor_2f64( -; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x 
double]* @src64 to <2 x double>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) -; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; AVX-NEXT: ret void +; CHECK-LABEL: @floor_2f64( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; CHECK-NEXT: ret void ; %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 @@ -228,29 +156,14 @@ } define void @floor_4f64() #0 { -; SSE2-LABEL: @floor_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @floor_4f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: ret void +; SSE-LABEL: @floor_4f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> 
[[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: ret void ; ; AVX-LABEL: @floor_4f64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 @@ -274,47 +187,20 @@ } define void @floor_8f64() #0 { -; SSE2-LABEL: @floor_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]]) -; SSE2-NEXT: [[FLOOR4:%.*]] = call double @llvm.floor.f64(double [[LD4]]) -; SSE2-NEXT: [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[LD5]]) -; SSE2-NEXT: [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[LD6]]) -; SSE2-NEXT: [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[LD7]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @floor_8f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, 
<2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: ret void +; SSE-LABEL: @floor_8f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]]) +; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]]) +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP4]]) +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: ret void ; ; AVX1-LABEL: @floor_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 @@ -368,26 +254,11 @@ } define void @nearbyint_2f64() #0 { -; SSE2-LABEL: @nearbyint_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 
0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @nearbyint_2f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: ret void -; -; AVX-LABEL: @nearbyint_2f64( -; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) -; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; AVX-NEXT: ret void +; CHECK-LABEL: @nearbyint_2f64( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; CHECK-NEXT: ret void ; %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 @@ -399,29 +270,14 @@ } define void @nearbyint_4f64() #0 { -; SSE2-LABEL: @nearbyint_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @nearbyint_4f64( -; SSE41-NEXT: 
[[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: ret void +; SSE-LABEL: @nearbyint_4f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: ret void ; ; AVX-LABEL: @nearbyint_4f64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 @@ -445,47 +301,20 @@ } define void @nearbyint_8f64() #0 { -; SSE2-LABEL: @nearbyint_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]]) -; SSE2-NEXT: [[NEARBYINT4:%.*]] = call double @llvm.nearbyint.f64(double [[LD4]]) -; SSE2-NEXT: [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[LD5]]) -; SSE2-NEXT: [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[LD6]]) -; SSE2-NEXT: [[NEARBYINT7:%.*]] = call double 
@llvm.nearbyint.f64(double [[LD7]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[NEARBYINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @nearbyint_8f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: ret void +; SSE-LABEL: @nearbyint_8f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 
+; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]]) +; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]]) +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP4]]) +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: ret void ; ; AVX1-LABEL: @nearbyint_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 @@ -539,26 +368,11 @@ } define void @rint_2f64() #0 { -; SSE2-LABEL: @rint_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @rint_2f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: ret void -; -; AVX-LABEL: @rint_2f64( -; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) -; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; AVX-NEXT: ret void +; CHECK-LABEL: @rint_2f64( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; CHECK-NEXT: ret void ; %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 @@ -570,29 +384,14 @@ } define void @rint_4f64() #0 { -; SSE2-LABEL: @rint_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* 
getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @rint_4f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: ret void +; SSE-LABEL: @rint_4f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: ret void ; ; AVX-LABEL: @rint_4f64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 @@ -616,47 +415,20 @@ } define void @rint_8f64() #0 { -; SSE2-LABEL: @rint_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), 
align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]]) -; SSE2-NEXT: [[RINT4:%.*]] = call double @llvm.rint.f64(double [[LD4]]) -; SSE2-NEXT: [[RINT5:%.*]] = call double @llvm.rint.f64(double [[LD5]]) -; SSE2-NEXT: [[RINT6:%.*]] = call double @llvm.rint.f64(double [[LD6]]) -; SSE2-NEXT: [[RINT7:%.*]] = call double @llvm.rint.f64(double [[LD7]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[RINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @rint_8f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> 
[[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: ret void +; SSE-LABEL: @rint_8f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]]) +; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]]) +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP4]]) +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: ret void ; ; AVX1-LABEL: @rint_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 @@ -710,26 +482,11 @@ } define void @trunc_2f64() #0 { -; SSE2-LABEL: @trunc_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @trunc_2f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: ret void -; -; AVX-LABEL: @trunc_2f64( -; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> 
[[TMP1]]) -; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; AVX-NEXT: ret void +; CHECK-LABEL: @trunc_2f64( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; CHECK-NEXT: ret void ; %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 @@ -741,29 +498,14 @@ } define void @trunc_4f64() #0 { -; SSE2-LABEL: @trunc_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @trunc_4f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: ret void +; SSE-LABEL: @trunc_4f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x 
double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: ret void ; ; AVX-LABEL: @trunc_4f64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 @@ -787,47 +529,20 @@ } define void @trunc_8f64() #0 { -; SSE2-LABEL: @trunc_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) -; SSE2-NEXT: [[TRUNC4:%.*]] = call double @llvm.trunc.f64(double [[LD4]]) -; SSE2-NEXT: [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[LD5]]) -; SSE2-NEXT: [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[LD6]]) -; SSE2-NEXT: [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[LD7]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @trunc_8f64( -; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x 
double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: ret void +; SSE-LABEL: @trunc_8f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) +; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]]) +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP4]]) +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: ret void ; ; AVX1-LABEL: @trunc_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 @@ -881,32 +596,11 @@ } define void @ceil_4f32() #0 { -; SSE2-LABEL: @ceil_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* 
@src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @ceil_4f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: ret void -; -; AVX-LABEL: @ceil_4f32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) -; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; AVX-NEXT: ret void +; CHECK-LABEL: @ceil_4f32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; CHECK-NEXT: ret void ; %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 @@ -924,41 +618,14 @@ } define void @ceil_8f32() #0 { -; SSE2-LABEL: @ceil_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; 
SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]]) -; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]]) -; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]]) -; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @ceil_8f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: ret void +; SSE-LABEL: @ceil_8f32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: ret void ; ; AVX-LABEL: @ceil_8f32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 @@ -994,71 +661,20 @@ } define void @ceil_16f32() #0 { -; SSE2-LABEL: @ceil_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; 
SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]]) -; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]]) -; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]]) -; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]]) -; SSE2-NEXT: [[CEIL8:%.*]] = call float @llvm.ceil.f32(float [[LD8]]) -; SSE2-NEXT: [[CEIL9:%.*]] = call float @llvm.ceil.f32(float [[LD9]]) -; SSE2-NEXT: [[CEIL10:%.*]] = call float @llvm.ceil.f32(float [[LD10]]) -; SSE2-NEXT: [[CEIL11:%.*]] = call float @llvm.ceil.f32(float [[LD11]]) -; SSE2-NEXT: [[CEIL12:%.*]] = call float @llvm.ceil.f32(float [[LD12]]) -; SSE2-NEXT: [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[LD13]]) -; SSE2-NEXT: [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[LD14]]) -; SSE2-NEXT: [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[LD15]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds 
([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[CEIL8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[CEIL9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[CEIL10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[CEIL11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @ceil_16f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: ret void +; SSE-LABEL: @ceil_16f32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], 
[16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]]) +; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]]) +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP4]]) +; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: ret void ; ; AVX1-LABEL: @ceil_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 @@ -1136,32 +752,11 @@ } define void @floor_4f32() #0 { -; SSE2-LABEL: @floor_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @floor_4f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: ret void -; -; AVX-LABEL: @floor_4f32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x 
float> [[TMP1]]) -; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; AVX-NEXT: ret void +; CHECK-LABEL: @floor_4f32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; CHECK-NEXT: ret void ; %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 @@ -1179,41 +774,14 @@ } define void @floor_8f32() #0 { -; SSE2-LABEL: @floor_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]]) -; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]]) -; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]]) -; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @floor_8f32( -; SSE41-NEXT: [[TMP1:%.*]] = 
load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: ret void +; SSE-LABEL: @floor_8f32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: ret void ; ; AVX-LABEL: @floor_8f32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 @@ -1249,71 +817,20 @@ } define void @floor_16f32() #0 { -; SSE2-LABEL: @floor_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x 
float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]]) -; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]]) -; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]]) -; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]]) -; SSE2-NEXT: [[FLOOR8:%.*]] = call float @llvm.floor.f32(float [[LD8]]) -; SSE2-NEXT: [[FLOOR9:%.*]] = call float @llvm.floor.f32(float [[LD9]]) -; SSE2-NEXT: [[FLOOR10:%.*]] = call float @llvm.floor.f32(float [[LD10]]) -; SSE2-NEXT: [[FLOOR11:%.*]] = call float @llvm.floor.f32(float [[LD11]]) -; SSE2-NEXT: [[FLOOR12:%.*]] = call float @llvm.floor.f32(float [[LD12]]) -; SSE2-NEXT: [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[LD13]]) -; SSE2-NEXT: [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[LD14]]) -; SSE2-NEXT: [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[LD15]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[FLOOR8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[FLOOR9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[FLOOR10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 -; 
SSE2-NEXT: ret void -; -; SSE41-LABEL: @floor_16f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: ret void +; SSE-LABEL: @floor_16f32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]]) +; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]]) +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP4]]) +; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: ret void ; ; AVX1-LABEL: @floor_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 @@ -1391,32 +908,11 @@ } define void @nearbyint_4f32() 
#0 { -; SSE2-LABEL: @nearbyint_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @nearbyint_4f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: ret void -; -; AVX-LABEL: @nearbyint_4f32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) -; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; AVX-NEXT: ret void +; CHECK-LABEL: @nearbyint_4f32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; CHECK-NEXT: ret void ; %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 @@ -1434,41 +930,14 @@ } define void @nearbyint_8f32() #0 { -; SSE2-LABEL: @nearbyint_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load 
float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]]) -; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) -; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) -; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @nearbyint_8f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: ret void +; SSE-LABEL: @nearbyint_8f32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP4]], 
<4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: ret void ; ; AVX-LABEL: @nearbyint_8f32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 @@ -1504,71 +973,20 @@ } define void @nearbyint_16f32() #0 { -; SSE2-LABEL: @nearbyint_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]]) -; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) -; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) -; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) -; SSE2-NEXT: [[NEARBYINT8:%.*]] = call float @llvm.nearbyint.f32(float [[LD8]]) -; SSE2-NEXT: [[NEARBYINT9:%.*]] = call float @llvm.nearbyint.f32(float [[LD9]]) -; SSE2-NEXT: [[NEARBYINT10:%.*]] = call float @llvm.nearbyint.f32(float [[LD10]]) -; SSE2-NEXT: [[NEARBYINT11:%.*]] = call float @llvm.nearbyint.f32(float [[LD11]]) -; SSE2-NEXT: [[NEARBYINT12:%.*]] = call float @llvm.nearbyint.f32(float [[LD12]]) -; 
SSE2-NEXT: [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[LD13]]) -; SSE2-NEXT: [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[LD14]]) -; SSE2-NEXT: [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[LD15]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[NEARBYINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[NEARBYINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[NEARBYINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @nearbyint_16f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 
-; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: ret void +; SSE-LABEL: @nearbyint_16f32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) +; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]]) +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP4]]) +; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: ret void ; ; AVX1-LABEL: @nearbyint_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 @@ -1646,32 +1064,11 @@ } define void @rint_4f32() #0 { -; SSE2-LABEL: @rint_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @rint_4f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: ret void -; -; AVX-LABEL: @rint_4f32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) -; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; AVX-NEXT: ret void +; CHECK-LABEL: @rint_4f32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; CHECK-NEXT: ret void ; %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 @@ -1689,41 +1086,14 @@ } define void @rint_8f32() #0 { -; SSE2-LABEL: @rint_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) -; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]]) -; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) -; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) -; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr 
inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @rint_8f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: ret void +; SSE-LABEL: @rint_8f32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: ret void ; ; AVX-LABEL: @rint_8f32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 @@ -1759,71 +1129,20 @@ } define void @rint_16f32() #0 { -; SSE2-LABEL: @rint_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* 
@src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) -; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]]) -; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) -; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) -; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) -; SSE2-NEXT: [[RINT8:%.*]] = call float @llvm.rint.f32(float [[LD8]]) -; SSE2-NEXT: [[RINT9:%.*]] = call float @llvm.rint.f32(float [[LD9]]) -; SSE2-NEXT: [[RINT10:%.*]] = call float @llvm.rint.f32(float [[LD10]]) -; SSE2-NEXT: [[RINT11:%.*]] = call float @llvm.rint.f32(float [[LD11]]) -; SSE2-NEXT: [[RINT12:%.*]] = call float @llvm.rint.f32(float [[LD12]]) -; SSE2-NEXT: [[RINT13:%.*]] = call float @llvm.rint.f32(float [[LD13]]) -; SSE2-NEXT: [[RINT14:%.*]] = call float @llvm.rint.f32(float [[LD14]]) -; SSE2-NEXT: [[RINT15:%.*]] = call float @llvm.rint.f32(float [[LD15]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[RINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[RINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[RINT10]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @rint_16f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: ret void +; SSE-LABEL: @rint_16f32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) +; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]]) +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP4]]) +; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; 
SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: ret void ; ; AVX1-LABEL: @rint_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 @@ -1901,32 +1220,11 @@ } define void @trunc_4f32() #0 { -; SSE2-LABEL: @trunc_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @trunc_4f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: ret void -; -; AVX-LABEL: @trunc_4f32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) -; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; AVX-NEXT: ret void +; CHECK-LABEL: @trunc_4f32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; CHECK-NEXT: ret void ; %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 @@ -1944,41 +1242,14 @@ } define void @trunc_8f32() #0 { -; SSE2-LABEL: @trunc_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, 
float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]]) -; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) -; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) -; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @trunc_8f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: ret void +; SSE-LABEL: @trunc_8f32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 
to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: ret void ; ; AVX-LABEL: @trunc_8f32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 @@ -2014,71 +1285,20 @@ } define void @trunc_16f32() #0 { -; SSE2-LABEL: @trunc_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]]) -; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) -; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) -; SSE2-NEXT: [[TRUNC7:%.*]] = call float 
@llvm.trunc.f32(float [[LD7]]) -; SSE2-NEXT: [[TRUNC8:%.*]] = call float @llvm.trunc.f32(float [[LD8]]) -; SSE2-NEXT: [[TRUNC9:%.*]] = call float @llvm.trunc.f32(float [[LD9]]) -; SSE2-NEXT: [[TRUNC10:%.*]] = call float @llvm.trunc.f32(float [[LD10]]) -; SSE2-NEXT: [[TRUNC11:%.*]] = call float @llvm.trunc.f32(float [[LD11]]) -; SSE2-NEXT: [[TRUNC12:%.*]] = call float @llvm.trunc.f32(float [[LD12]]) -; SSE2-NEXT: [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[LD13]]) -; SSE2-NEXT: [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[LD14]]) -; SSE2-NEXT: [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[LD15]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[TRUNC8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[TRUNC9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[TRUNC10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 -; SSE2-NEXT: ret void -; -; SSE41-LABEL: @trunc_16f32( -; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x 
float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: ret void +; SSE-LABEL: @trunc_16f32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) +; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]]) +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP4]]) +; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: ret void ; ; AVX1-LABEL: @trunc_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 Index: test/Transforms/SLPVectorizer/X86/powof2div.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/powof2div.ll +++ test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -58,59 +58,26 @@ } define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ -; AVX1-LABEL: @powof2div_nonuniform( -; AVX1-NEXT: entry: -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4 -; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[ADD]], 2 -; AVX1-NEXT: store i32 [[DIV]], i32* [[A:%.*]], align 4 -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 -; AVX1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = 
getelementptr inbounds i32, i32* [[C]], i64 1 -; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 -; AVX1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] -; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4 -; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; AVX1-NEXT: store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4 -; AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; AVX1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 -; AVX1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; AVX1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4 -; AVX1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8 -; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; AVX1-NEXT: store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4 -; AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; AVX1-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4 -; AVX1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; AVX1-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4 -; AVX1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]] -; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16 -; AVX1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; AVX1-NEXT: store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @powof2div_nonuniform( -; AVX2-NEXT: entry: -; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 -; AVX2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 -; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 -; AVX2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; AVX2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; AVX2-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; AVX2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; AVX2-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; AVX2-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; AVX2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] -; AVX2-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], -; AVX2-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 -; AVX2-NEXT: ret void +; CHECK-LABEL: @powof2div_nonuniform( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; 
CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: ret void ; entry: %0 = load i32, i32* %b, align 4 Index: test/Transforms/SLPVectorizer/X86/shift-ashr.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -22,85 +22,34 @@ define void @ashr_v8i64() { ; SSE-LABEL: @ashr_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]] -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* 
@c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = ashr <2 x i64> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = ashr <2 x i64> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = ashr <2 x i64> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = ashr <2 x i64> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @ashr_v8i64( -; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; 
AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]] -; AVX1-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]] -; AVX1-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]] -; AVX1-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]] -; AVX1-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]] -; AVX1-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]] -; AVX1-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]] -; AVX1-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]] -; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @ashr_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]] -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 -; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: ret void +; AVX-LABEL: @ashr_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = load <4 x 
i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @ashr_v8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 @@ -157,101 +106,34 @@ define void @ashr_v16i32() { ; SSE-LABEL: @ashr_v16i32( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 
6), align 4 -; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = ashr i32 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i32 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = ashr i32 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = ashr i32 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i32 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i32 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i32 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = ashr i32 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = ashr i32 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = ashr i32 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = ashr i32 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = ashr i32 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = ashr i32 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = ashr i32 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = ashr i32 [[A15]], [[B15]] -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds 
([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @ashr_v16i32( -; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]] -; AVX1-NEXT: 
[[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @ashr_v16i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]] -; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 -; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: ret void +; AVX-LABEL: @ashr_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @ashr_v16i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 @@ -340,134 +222,22 @@ define void @ashr_v32i16() { ; SSE-LABEL: @ashr_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* 
@a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr 
inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = ashr i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i16 [[A2]], 
[[B2]] -; SSE-NEXT: [[R3:%.*]] = ashr i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = ashr i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = ashr i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = ashr i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = ashr i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = ashr i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = ashr i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = ashr i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = ashr i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = ashr i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = ashr i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = ashr i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = ashr i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = ashr i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = ashr i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = ashr i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = ashr i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = ashr i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = ashr i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = ashr i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = ashr i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = ashr i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = ashr i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = ashr i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = ashr i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = ashr i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 
0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = ashr <8 x i16> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = ashr <8 x i16> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = ashr <8 x i16> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = ashr <8 x i16> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr 
inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @ashr_v32i16(
Index: test/Transforms/SLPVectorizer/X86/shift-lshr.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/shift-lshr.ll
+++ test/Transforms/SLPVectorizer/X86/shift-lshr.ll
@@ -40,35 +40,16 @@
; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: ret void
;
-; AVX1-LABEL: @lshr_v8i64(
-; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
-; AVX1-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
-; AVX1-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
-; AVX1-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
-; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT: ret void
-;
-; AVX2-LABEL: @lshr_v8i64(
-; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
-; AVX2-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
-; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT:
ret void +; AVX-LABEL: @lshr_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @lshr_v8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 @@ -125,70 +106,22 @@ define void @lshr_v16i32() { ; SSE-LABEL: @lshr_v16i32( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; 
SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = lshr i32 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = lshr i32 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = lshr i32 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = lshr i32 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = lshr i32 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = lshr i32 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = lshr i32 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = lshr i32 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = lshr i32 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = lshr i32 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = lshr i32 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = lshr i32 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = lshr i32 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = lshr i32 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = lshr i32 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = lshr i32 [[A15]], [[B15]] -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x 
i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = lshr <4 x i32> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = lshr <4 x i32> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v16i32( @@ -289,134 +222,22 @@ define void @lshr_v32i16() { ; SSE-LABEL: @lshr_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* 
@a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr 
inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = lshr i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = lshr i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = lshr i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = lshr i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = lshr i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = lshr i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = lshr i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = 
lshr i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = lshr i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = lshr i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = lshr i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = lshr i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = lshr i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = lshr i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = lshr i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = lshr i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = lshr i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = lshr i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = lshr i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = lshr i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = lshr i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = lshr i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = lshr i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = lshr i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = lshr i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = lshr i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = lshr i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = lshr i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = lshr i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = lshr i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = lshr i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = lshr i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, 
i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = lshr <8 x i16> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = lshr <8 x i16> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = lshr <8 x i16> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = lshr <8 x i16> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v32i16( Index: test/Transforms/SLPVectorizer/X86/shift-shl.ll 
===================================================================
--- test/Transforms/SLPVectorizer/X86/shift-shl.ll
+++ test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -40,35 +40,16 @@
; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: ret void
;
-; AVX1-LABEL: @shl_v8i64(
-; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
-; AVX1-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
-; AVX1-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
-; AVX1-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
-; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT: ret void
-;
-; AVX2-LABEL: @shl_v8i64(
-; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
-; AVX2-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
-; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT: ret void
+; AVX-LABEL: @shl_v8i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>*
bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @shl_v8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 @@ -241,134 +222,22 @@ define void @shl_v32i16() { ; SSE-LABEL: @shl_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], 
[32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* 
getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = shl i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = shl i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = shl i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = shl i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = shl i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = shl i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = shl i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = shl i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = shl i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = shl i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = shl i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = shl i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = shl i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = shl i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = shl i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = shl i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = shl i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = shl i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = shl i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = shl i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = shl i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = shl i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = shl i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = shl i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = shl i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = shl i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = shl i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = shl i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = shl i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = shl i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = shl i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = shl i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 
x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = 
load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = shl <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = shl <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = shl <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = shl <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @shl_v32i16(
Index: test/Transforms/SLPVectorizer/X86/sitofp.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -1,8 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
@@ -21,36 +18,21 @@
;
define void @sitofp_2i64_2f64() #0 {
-; SSE-LABEL: @sitofp_2i64_2f64(
-; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
-; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT: ret void
+; CHECK-LABEL: @sitofp_2i64_2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
+; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; CHECK-NEXT: ret void
;
; AVX256NODQ-LABEL: @sitofp_2i64_2f64(
; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
-; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0
+; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1
+; AVX256NODQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @sitofp_2i64_2f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
-; AVX512-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
-; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @sitofp_2i64_2f64(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
-; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
-; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX256DQ-NEXT: ret void
-;
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
%cvt0 = sitofp i64 %ld0 to double
@@ -62,20 +44,20 @@
define void @sitofp_4i64_4f64() #0 {
; SSE-LABEL: @sitofp_4i64_4f64(
-; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
-; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
-; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
-; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
-; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8
x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 +; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i64> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX-LABEL: @sitofp_4i64_4f64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_4i64_4f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -85,24 +67,12 @@ ; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double ; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double ; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_4i64_4f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX512-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_4i64_4f64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to 
<4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256DQ-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -120,32 +90,35 @@ define void @sitofp_8i64_8f64() #0 { ; SSE-LABEL: @sitofp_8i64_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; SSE-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; SSE-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* 
bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i64> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i64> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i64> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i64_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; +; AVX256-LABEL: @sitofp_8i64_8f64( +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 +; AVX256-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_8i64_8f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -163,31 +136,17 @@ ; AVX256NODQ-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to double ; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to double ; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store double [[CVT6]], double* 
getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i64_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_8i64_8f64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; AVX256DQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double> -; AVX256DQ-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 -; AVX256DQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -217,12 +176,9 @@ define void @sitofp_2i32_2f64() #0 { ; CHECK-LABEL: @sitofp_2i32_2f64( -; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[LD0]] to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x 
i32]* @src32 to <2 x i32>*), align 64 +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 @@ -236,18 +192,12 @@ define void @sitofp_4i32_4f64() #0 { ; SSE-LABEL: @sitofp_4i32_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8 -; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i32 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i32 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i32 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i32 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i32_4f64( @@ -273,32 +223,26 @@ define void @sitofp_8i32_8f64() #0 { ; SSE-LABEL: @sitofp_8i32_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8 -; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16 -; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8 -; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i32 [[LD0]] to double -; SSE-NEXT: 
[[CVT1:%.*]] = sitofp i32 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i32 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i32 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = sitofp i32 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = sitofp i32 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = sitofp i32 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = sitofp i32 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i32_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_8i32_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 
x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 @@ -307,12 +251,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i32_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 @@ -343,12 +281,9 @@ define void @sitofp_2i16_2f64() #0 { ; CHECK-LABEL: @sitofp_2i16_2f64( -; CHECK-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; CHECK-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 @@ -362,18 +297,12 @@ define void @sitofp_4i16_4f64() #0 { ; SSE-LABEL: @sitofp_4i16_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: 
[[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i16_4f64( @@ -399,32 +328,26 @@ define void @sitofp_8i16_8f64() #0 { ; SSE-LABEL: @sitofp_8i16_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x 
i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i16_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_8i16_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 @@ -433,12 +356,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i16_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 @@ -469,12 +386,9 @@ define void @sitofp_2i8_2f64() #0 { ; CHECK-LABEL: @sitofp_2i8_2f64( -; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; 
CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; CHECK-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 @@ -488,18 +402,12 @@ define void @sitofp_4i8_4f64() #0 { ; SSE-LABEL: @sitofp_4i8_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i8 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i8 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i8_4f64( @@ -525,32 +433,26 @@ define void @sitofp_8i8_8f64() #0 { ; SSE-LABEL: @sitofp_8i8_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1 -; SSE-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4 -; SSE-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1 -; SSE-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i8 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp 
i8 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = sitofp i8 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = sitofp i8 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = sitofp i8 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = sitofp i8 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_8i8_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_8i8_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 @@ -559,12 +461,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x 
double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_8i8_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 @@ -617,48 +513,27 @@ } define void @sitofp_4i64_4f32() #0 { -; SSE-LABEL: @sitofp_4i64_4f32( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: ret void +; CHECK-LABEL: @sitofp_4i64_4f32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; CHECK-NEXT: ret void ; ; AVX256NODQ-LABEL: @sitofp_4i64_4f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], 
float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float> +; AVX256NODQ-NEXT: [[TMP4:%.*]] = sitofp <2 x i64> [[TMP2]] to <2 x float> +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP9]], i32 2 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP11]], i32 3 +; AVX256NODQ-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_4i64_4f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> -; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_4i64_4f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> -; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -676,71 +551,43 @@ define void @sitofp_8i64_8f32() #0 { ; SSE-LABEL: @sitofp_8i64_8f32( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; SSE-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; SSE-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, 
i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX-LABEL: @sitofp_8i64_8f32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> +; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @sitofp_8i64_8f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 
0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to float -; AVX256NODQ-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to float -; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float -; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; AVX256NODQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x float> +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <8 x float> undef, float [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP6]], float [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x float> [[TMP8]], float [[TMP9]], i32 2 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP10]], float [[TMP11]], i32 3 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP13]], i32 4 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP15]], i32 5 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP17]], i32 6 +; AVX256NODQ-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; AVX256NODQ-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP18]], float [[TMP19]], i32 7 +; AVX256NODQ-NEXT: store <8 x float> [[TMP20]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; 
AVX512-LABEL: @sitofp_8i64_8f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> -; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_8i64_8f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> -; AVX256DQ-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -849,6 +696,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_16i32_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_16i32_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 @@ -857,12 +710,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_16i32_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64 %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4 @@ -996,6 +843,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_16i16_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_16i16_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x 
i16>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 @@ -1004,12 +857,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_16i16_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2 @@ -1143,6 +990,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @sitofp_16i8_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @sitofp_16i8_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 @@ -1151,12 +1004,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @sitofp_16i8_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1 Index: test/Transforms/SLPVectorizer/X86/uitofp.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/uitofp.ll +++ test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -1,8 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa 
-slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ @@ -83,6 +80,12 @@ ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @uitofp_8i64_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @uitofp_8i64_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 @@ -91,12 +94,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @uitofp_8i64_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -126,36 +123,21 @@ } define void @uitofp_2i32_2f64() #0 { -; SSE-LABEL: @uitofp_2i32_2f64( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: ret void +; CHECK-LABEL: @uitofp_2i32_2f64( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), 
align 64 +; CHECK-NEXT: ret void ; ; AVX256NODQ-LABEL: @uitofp_2i32_2f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 ; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double ; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; AVX256NODQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @uitofp_2i32_2f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> -; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @uitofp_2i32_2f64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> -; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 %cvt0 = uitofp i32 %ld0 to double @@ -167,18 +149,12 @@ define void @uitofp_4i32_4f64() #0 { ; SSE-LABEL: @uitofp_4i32_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8 -; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = uitofp i32 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = uitofp i32 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds 
([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i32> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @uitofp_4i32_4f64( @@ -204,32 +180,26 @@ define void @uitofp_8i32_8f64() #0 { ; SSE-LABEL: @uitofp_8i32_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8 -; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16 -; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8 -; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = uitofp i32 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = uitofp i32 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = uitofp i32 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = uitofp i32 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = uitofp i32 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = uitofp i32 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = 
load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i32> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i32> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @uitofp_8i32_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @uitofp_8i32_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 @@ -238,12 +208,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @uitofp_8i32_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 @@ -274,12 +238,9 @@ define void @uitofp_2i16_2f64() #0 { ; CHECK-LABEL: @uitofp_2i16_2f64( -; CHECK-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double -; CHECK-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x 
double> +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; CHECK-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 @@ -293,18 +254,12 @@ define void @uitofp_4i16_4f64() #0 { ; SSE-LABEL: @uitofp_4i16_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @uitofp_4i16_4f64( @@ -330,32 +285,26 @@ define void @uitofp_8i16_8f64() #0 { ; SSE-LABEL: @uitofp_8i16_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to double -; 
SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @uitofp_8i16_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @uitofp_8i16_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 @@ -364,12 +313,6 @@ ; AVX256-NEXT: 
store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @uitofp_8i16_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 @@ -399,36 +342,21 @@ } define void @uitofp_2i8_2f64() #0 { -; SSE-LABEL: @uitofp_2i8_2f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: ret void +; CHECK-LABEL: @uitofp_2i8_2f64( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; CHECK-NEXT: ret void ; ; AVX256NODQ-LABEL: @uitofp_2i8_2f64( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 ; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double ; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[CVT1]], i32 1 +; AVX256NODQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @uitofp_2i8_2f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> -; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @uitofp_2i8_2f64( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> -; AVX256DQ-NEXT: store 
<2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 %cvt0 = uitofp i8 %ld0 to double @@ -440,18 +368,12 @@ define void @uitofp_4i8_4f64() #0 { ; SSE-LABEL: @uitofp_4i8_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = uitofp i8 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = uitofp i8 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i8> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @uitofp_4i8_4f64( @@ -477,32 +399,26 @@ define void @uitofp_8i8_8f64() #0 { ; SSE-LABEL: @uitofp_8i8_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1 -; SSE-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4 -; SSE-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1 -; SSE-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = uitofp i8 [[LD2]] 
to double -; SSE-NEXT: [[CVT3:%.*]] = uitofp i8 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = uitofp i8 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = uitofp i8 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = uitofp i8 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = uitofp i8 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i8> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i8> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @uitofp_8i8_8f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double> +; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @uitofp_8i8_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 @@ -511,12 +427,6 @@ ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* 
bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @uitofp_8i8_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double> -; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 @@ -569,48 +479,27 @@ } define void @uitofp_4i64_4f32() #0 { -; SSE-LABEL: @uitofp_4i64_4f32( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: ret void +; CHECK-LABEL: @uitofp_4i64_4f32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; CHECK-NEXT: ret void ; ; AVX256NODQ-LABEL: @uitofp_4i64_4f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; 
AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float> +; AVX256NODQ-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x float> +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP9]], i32 2 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP11]], i32 3 +; AVX256NODQ-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @uitofp_4i64_4f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> -; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @uitofp_4i64_4f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> -; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -628,71 +517,43 @@ define void @uitofp_8i64_8f32() #0 { ; SSE-LABEL: @uitofp_8i64_8f32( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; SSE-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; SSE-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds 
([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = uitofp i64 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX-LABEL: @uitofp_8i64_8f32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float> +; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX-NEXT: ret void +; ; AVX256NODQ-LABEL: @uitofp_8i64_8f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr 
inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: [[CVT4:%.*]] = uitofp i64 [[LD4]] to float -; AVX256NODQ-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float -; AVX256NODQ-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float -; AVX256NODQ-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> +; AVX256NODQ-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x float> +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = insertelement <8 x float> undef, float [[TMP5]], i32 0 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP6]], float [[TMP7]], i32 1 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x float> [[TMP8]], float [[TMP9]], i32 2 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP10]], float [[TMP11]], i32 3 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP13]], i32 4 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP15]], i32 5 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP17]], i32 6 +; AVX256NODQ-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; AVX256NODQ-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP18]], float [[TMP19]], i32 7 +; AVX256NODQ-NEXT: store <8 x float> [[TMP20]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x 
float>*), align 64 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @uitofp_8i64_8f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float> -; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @uitofp_8i64_8f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float> -; AVX256DQ-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -801,6 +662,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @uitofp_16i32_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @uitofp_16i32_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 @@ -809,12 +676,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @uitofp_16i32_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64 %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4 @@ -948,6 +809,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @uitofp_16i16_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @uitofp_16i16_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x 
i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 @@ -956,12 +823,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @uitofp_16i16_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2 @@ -1095,6 +956,12 @@ ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; +; AVX512-LABEL: @uitofp_16i8_16f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float> +; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 +; AVX512-NEXT: ret void +; ; AVX256-LABEL: @uitofp_16i8_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 ; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 @@ -1103,12 +970,6 @@ ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void -; -; AVX512-LABEL: @uitofp_16i8_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float> -; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 -; AVX512-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1