diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1297,6 +1297,41 @@
     LT.first = NumOfDests * NumOfShufflesPerDest;
   }
 
+  // Add-sub pattern.
+  if (Kind == TTI::SK_Select && Args.size() == 2 && isa<Instruction>(Args[0]) &&
+      isa<Instruction>(Args[1]) && LT.second.isVector()) {
+    unsigned EvenOpcode = cast<Instruction>(Args[0])->getOpcode();
+    unsigned OddOpcode = cast<Instruction>(Args[1])->getOpcode();
+    unsigned Lanes = LT.second.getVectorNumElements();
+    auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), Lanes);
+    if (isLegalAltInstr(VecTy, EvenOpcode, OddOpcode)) {
+      static const CostTblEntry SSE3AddSubTable[] = {
+          {TTI::SK_Select, MVT::v4f32, 1}, // addsubps
+          {TTI::SK_Select, MVT::v2f64, 1}, // addsubpd
+      };
+      static const CostTblEntry AVXAddSubTable[] = {
+          {TTI::SK_Select, MVT::v4f32, 1}, // vaddsubps
+          {TTI::SK_Select, MVT::v2f64, 1}, // vaddsubpd
+      };
+      static const CostTblEntry AVX2AddSubTable[] = {
+          {TTI::SK_Select, MVT::v8f32, 1}, // vaddsubps
+          {TTI::SK_Select, MVT::v4f64, 1}, // vaddsubpd
+      };
+      if (ST->hasSSE3())
+        if (const auto *Entry =
+                CostTableLookup(SSE3AddSubTable, Kind, LT.second))
+          return LT.first * Entry->Cost;
+      if (ST->hasAVX())
+        if (const auto *Entry =
+                CostTableLookup(AVXAddSubTable, Kind, LT.second))
+          return LT.first * Entry->Cost;
+      if (ST->hasAVX2())
+        if (const auto *Entry =
+                CostTableLookup(AVX2AddSubTable, Kind, LT.second))
+          return LT.first * Entry->Cost;
+    }
+  }
+
   static const CostTblEntry AVX512FP16ShuffleTbl[] = {
       {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
       {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -939,6 +939,10 @@
   /// insertelement nodes, otherwise skip them.
   Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
 
+  /// \returns true if \p TE is an alt-shuffle that can be lowered to a single
+  /// instruction. An example of this is the X86 addsub instruction.
+  bool isAltShuffleThatLowersToOneInstr(const TreeEntry *TE) const;
+
   /// Reorders the current graph to the most profitable order starting from the
   /// root node to the leaf nodes. The best order is chosen only from the nodes
   /// of the same size (vectorization factor). Smaller nodes are considered
@@ -3617,6 +3621,18 @@
   return None;
 }
 
+bool BoUpSLP::isAltShuffleThatLowersToOneInstr(const TreeEntry *TE) const {
+  if (TE->isAltShuffleWithRepeatingEvenOddOpcodes()) {
+    VectorType *VecTy =
+        FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
+    unsigned EvenOpcode = TE->getOpcode();
+    unsigned OddOpcode = TE->getAltOpcode();
+    if (TTI->isLegalAltInstr(VecTy, EvenOpcode, OddOpcode))
+      return true;
+  }
+  return false;
+}
+
 void BoUpSLP::reorderTopToBottom() {
   // Maps VF to the graph nodes.
   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -3649,15 +3665,9 @@
     // Patterns like [fadd,fsub] can be combined into a single instruction in
     // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
     // to take into account their order when looking for the most used order.
-    if (TE->isAltShuffleWithRepeatingEvenOddOpcodes()) {
-      VectorType *VecTy =
-          FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
-      unsigned EvenOpcode = TE->getOpcode();
-      unsigned OddOpcode = TE->getAltOpcode();
-      if (TTI->isLegalAltInstr(VecTy, EvenOpcode, OddOpcode)) {
-        VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
-      }
+    if (isAltShuffleThatLowersToOneInstr(TE.get())) {
+      VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+      AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
       // TODO: Check the reverse order too.
     }
@@ -6277,8 +6287,10 @@
       ScalarCost += TTI->getInstructionCost(I, CostKind);
     }
     // VecCost is equal to sum of the cost of creating 2 vectors
-    // and the cost of creating shuffle.
+    // and the cost of creating shuffle, except if this is an alternate
+    // sequence that can be lowered to a single instruction, like x86 addsub.
    InstructionCost VecCost = 0;
+    bool LowersToOneInstr = isAltShuffleThatLowersToOneInstr(E);
     // Try to find the previous shuffle node with the same operands and same
     // main/alternate ops.
     auto &&TryFindNodeWithEqualOperands = [this, E]() {
@@ -6303,9 +6315,12 @@
       // No need to add new vector costs here since we're going to reuse
       // same main/alternate vector ops, just do different shuffling.
     } else if (Instruction::isBinaryOp(E->getOpcode())) {
-      VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
-      VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
-                                             CostKind);
+      if (!LowersToOneInstr) {
+        VecCost =
+            TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+        VecCost +=
+            TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
+      }
     } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
       VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
                                         Builder.getInt1Ty(),
@@ -6326,8 +6341,10 @@
     }
 
     if (E->ReuseShuffleIndices.empty()) {
-      CommonCost =
-          TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
+      ArrayRef<Value *> Args = {VL[0], VL[1]};
+      CommonCost = TTI->getShuffleCost(
+          TargetTransformInfo::SK_Select, FinalVecTy, /*Mask=*/None,
+          /*Index=*/0, /*SubTp=*/nullptr, /*Args=*/Args);
     } else {
       SmallVector<int> Mask;
       buildShuffleEntryMask(
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-add-sub.ll b/llvm/test/Analysis/CostModel/X86/shuffle-add-sub.ll
--- a/llvm/test/Analysis/CostModel/X86/shuffle-add-sub.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-add-sub.ll
@@ -279,16 +279,16 @@
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %addsub_32xf16 = shufflevector <32 x half> %sub_32xf16, <32 x half> %add_32xf16, <32 x i32>
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_2xf32 = fadd <2 x float> undef, undef
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_2xf32 = fsub <2 x float> undef, undef
-; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_2xf32 = shufflevector <2 x float> %sub_2xf32, <2 x float> %add_2xf32, <2 x i32>
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_2xf32 = shufflevector <2 x float> %sub_2xf32, <2 x float> %add_2xf32, <2 x i32>
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_4xf32 = fadd <4 x float> undef, undef
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_4xf32 = fsub <4 x float> undef, undef
-; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_4xf32 = shufflevector <4 x float> %sub_4xf32, <4 x float> %add_4xf32, <4 x i32>
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_4xf32 = shufflevector <4 x float> %sub_4xf32, <4 x float> %add_4xf32, <4 x i32>
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %add_8xf32 = fadd <8 x float> undef, undef
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sub_8xf32 = fsub <8 x float> undef, undef
-; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %addsub_8xf32 = shufflevector <8 x float> %sub_8xf32, <8 x float> %add_8xf32, <8 x i32>
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_8xf32 = shufflevector <8 x float> %sub_8xf32, <8 x float> %add_8xf32, <8 x i32>
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %add_16xf32 = fadd <16 x float> undef, undef
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sub_16xf32 = fsub <16 x float> undef, undef
-; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %addsub_16xf32 = shufflevector <16 x float> %sub_16xf32, <16 x float> %add_16xf32, <16 x i32>
+; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %addsub_16xf32 = shufflevector <16 x float> %sub_16xf32, <16 x float> %add_16xf32, <16 x i32>
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_2xf64 = fadd <2 x double> undef, undef
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_2xf64 = fsub <2 x double> undef, undef
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_2xf64 = shufflevector <2 x double> %sub_2xf64, <2 x double> %add_2xf64, <2 x i32>
@@ -372,16 +372,16 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %addsub_32xf16 = shufflevector <32 x half> %sub_32xf16, <32 x half> %add_32xf16, <32 x i32>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_2xf32 = fadd <2 x float> undef, undef
 ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_2xf32 = fsub <2 x float> undef, undef
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_2xf32 = shufflevector <2 x float> %sub_2xf32, <2 x float> %add_2xf32, <2 x i32>
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_2xf32 = shufflevector <2 x float> %sub_2xf32, <2 x float> %add_2xf32, <2 x i32>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_4xf32 = fadd <4 x float> undef, undef
 ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_4xf32 = fsub <4 x float> undef, undef
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_4xf32 = shufflevector <4 x float> %sub_4xf32, <4 x float> %add_4xf32, <4 x i32>
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_4xf32 = shufflevector <4 x float> %sub_4xf32, <4 x float> %add_4xf32, <4 x i32>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %add_8xf32 = fadd <8 x float> undef, undef
 ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sub_8xf32 = fsub <8 x float> undef, undef
-; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %addsub_8xf32 = shufflevector <8 x float> %sub_8xf32, <8 x float> %add_8xf32, <8 x i32>
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_8xf32 = shufflevector <8 x float> %sub_8xf32, <8 x float> %add_8xf32, <8 x i32>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %add_16xf32 = fadd <16 x float> undef, undef
 ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sub_16xf32 = fsub <16 x float> undef, undef
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %addsub_16xf32 = shufflevector <16 x float> %sub_16xf32, <16 x float> %add_16xf32, <16 x i32>
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %addsub_16xf32 = shufflevector <16 x float> %sub_16xf32, <16 x float> %add_16xf32, <16 x i32>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_2xf64 = fadd <2 x double> undef, undef
 ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_2xf64 = fsub <2 x double> undef, undef
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_2xf64 = shufflevector <2 x double> %sub_2xf64, <2 x double> %add_2xf64, <2 x i32>
@@ -465,16 +465,16 @@
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %addsub_32xf16 = shufflevector <32 x half> %sub_32xf16, <32 x half> %add_32xf16, <32 x i32>
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_2xf32 = fadd <2 x float> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_2xf32 = fsub <2 x float> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_2xf32 = shufflevector <2 x float> %sub_2xf32, <2 x float> %add_2xf32, <2 x i32>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_2xf32 = shufflevector <2 x float> %sub_2xf32, <2 x float> %add_2xf32, <2 x i32>
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_4xf32 = fadd <4 x float> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_4xf32 = fsub <4 x float> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_4xf32 = shufflevector <4 x float> %sub_4xf32, <4 x float> %add_4xf32, <4 x i32>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_4xf32 = shufflevector <4 x float> %sub_4xf32, <4 x float> %add_4xf32, <4 x i32>
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %add_8xf32 = fadd <8 x float> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sub_8xf32 = fsub <8 x float> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %addsub_8xf32 = shufflevector <8 x float> %sub_8xf32, <8 x float> %add_8xf32, <8 x i32>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_8xf32 = shufflevector <8 x float> %sub_8xf32, <8 x float> %add_8xf32, <8 x i32>
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %add_16xf32 = fadd <16 x float> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sub_16xf32 = fsub <16 x float> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %addsub_16xf32 = shufflevector <16 x float> %sub_16xf32, <16 x float> %add_16xf32, <16 x i32>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %addsub_16xf32 = shufflevector <16 x float> %sub_16xf32, <16 x float> %add_16xf32, <16 x i32>
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_2xf64 = fadd <2 x double> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_2xf64 = fsub <2 x double> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_2xf64 = shufflevector <2 x double> %sub_2xf64, <2 x double> %add_2xf64, <2 x i32>
@@ -558,16 +558,16 @@
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %addsub_32xf16 = shufflevector <32 x half> %sub_32xf16, <32 x half> %add_32xf16, <32 x i32>
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_2xf32 = fadd <2 x float> undef, undef
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_2xf32 = fsub <2 x float> undef, undef
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_2xf32 = shufflevector <2 x float> %sub_2xf32, <2 x float> %add_2xf32, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_2xf32 = shufflevector <2 x float> %sub_2xf32, <2 x float> %add_2xf32, <2 x i32>
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_4xf32 = fadd <4 x float> undef, undef
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_4xf32 = fsub <4 x float> undef, undef
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_4xf32 = shufflevector <4 x float> %sub_4xf32, <4 x float> %add_4xf32, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_4xf32 = shufflevector <4 x float> %sub_4xf32, <4 x float> %add_4xf32, <4 x i32>
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %add_8xf32 = fadd <8 x float> undef, undef
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sub_8xf32 = fsub <8 x float> undef, undef
-; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %addsub_8xf32 = shufflevector <8 x float> %sub_8xf32, <8 x float> %add_8xf32, <8 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %addsub_8xf32 = shufflevector <8 x float> %sub_8xf32, <8 x float> %add_8xf32, <8 x i32>
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %add_16xf32 = fadd <16 x float> undef, undef
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sub_16xf32 = fsub <16 x float> undef, undef
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %addsub_16xf32 = shufflevector <16 x float> %sub_16xf32, <16 x float> %add_16xf32, <16 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %addsub_16xf32 = shufflevector <16 x float> %sub_16xf32, <16 x float> %add_16xf32, <16 x i32>
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_2xf64 = fadd <2 x double> undef, undef
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub_2xf64 = fsub <2 x double> undef, undef
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %addsub_2xf64 = shufflevector <2 x double> %sub_2xf64, <2 x double> %add_2xf64, <2 x i32>
@@ -581,6 +581,7 @@
 ;
+
   %add_2xi8 = add <2 x i8> undef, undef
   %sub_2xi8 = sub <2 x i8> undef, undef
   %addsub_2xi8 = shufflevector <2 x i8> %sub_2xi8, <2 x i8> %add_2xi8, <2 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
@@ -125,23 +125,27 @@
 ; ENABLED-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
 ; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
 ; ENABLED-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
-; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
 ; ENABLED-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
 ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
-; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
 ; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
 ; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
-; ENABLED-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8
-; ENABLED-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8
+; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
 ; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8
-; ENABLED-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
-; ENABLED-NEXT: [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]]
-; ENABLED-NEXT: [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]]
-; ENABLED-NEXT: [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]]
-; ENABLED-NEXT: store double [[SUB0]], double* [[IDXS0]], align 8
-; ENABLED-NEXT: store double [[ADD1]], double* [[IDXS1]], align 8
+; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
+; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
+; ENABLED-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]]
+; ENABLED-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32>
+; ENABLED-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
+; ENABLED-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A1]], i32 1
+; ENABLED-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
+; ENABLED-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP8]]
+; ENABLED-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x i32>
+; ENABLED-NEXT: [[TMP12:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; ENABLED-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
 ; ENABLED-NEXT: ret void
 ;
 entry:
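
Note (illustration, not part of the patch): the shape of IR that this change teaches the cost model to price as a single instruction is an SK_Select shuffle whose two operands are an fsub and an fadd over the same inputs, taking even lanes from the fsub and odd lanes from the fadd. A minimal sketch is shown below; the function name and the fixed <2 x double> type are chosen for illustration only. On SSE3 and later this lowers to one addsubpd/vaddsubpd, which is why the shuffle now costs 1 instead of 2.

; Illustrative sketch only: even/odd fsub/fadd combined by a select shuffle.
define <2 x double> @addsub_sketch(<2 x double> %a, <2 x double> %b) {
  %sub = fsub <2 x double> %a, %b
  %add = fadd <2 x double> %a, %b
  ; Lane 0 comes from %sub (even), lane 1 from %add (odd).
  %res = shufflevector <2 x double> %sub, <2 x double> %add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %res
}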