Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -322,6 +322,7 @@
   enum ShuffleKind {
     SK_Broadcast,       ///< Broadcast element 0 to all other elements.
     SK_Reverse,         ///< Reverse the order of the vector.
+    SK_Alternate,       ///< Choose alternate elements from vector.
     SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
     SK_ExtractSubvector ///< ExtractSubvector Index indicates start offset.
   };
Index: lib/CodeGen/BasicTargetTransformInfo.cpp
===================================================================
--- lib/CodeGen/BasicTargetTransformInfo.cpp
+++ lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -39,6 +39,9 @@
   /// are set if the result needs to be inserted and/or extracted from vectors.
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
+  /// Estimate the cost overhead of SK_Alternate shuffle.
+  unsigned getAltShuffleOverhead(Type *Ty) const;
+
   const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); }
 
 public:
@@ -327,8 +330,28 @@
   return OpCost;
 }
 
+unsigned BasicTTI::getAltShuffleOverhead(Type *Ty) const {
+  assert(Ty->isVectorTy() && "Can only shuffle vectors");
+  unsigned Cost = 0;
+  // The shuffle cost is equal to the cost of extracting each element from its
+  // source vector plus the cost of inserting it into the result vector.
+
+  // e.g. a <4 x float> shuffle has a mask of <0,5,2,7>, i.e. we need to
+  // extract index 0 of the first vector, index 1 of the second vector, index
+  // 2 of the first vector and finally index 3 of the second vector, and
+  // insert them at indices <0,1,2,3> of the result vector.
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+  return Cost;
+}
+
 unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) const {
+  if (Kind == SK_Alternate) {
+    return getAltShuffleOverhead(Tp);
+  }
   return 1;
 }
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -443,31 +443,58 @@
 unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                 Type *SubTp) const {
-  // We only handle costs of reverse shuffles for now.
-  if (Kind != SK_Reverse)
+  // We only handle costs of reverse and alternate shuffles for now.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
-    // Reverse shuffle cost one instruction if we are shuffling within a double
-    // word (vrev) or two if we shuffle a quad word (vrev, vext).
-    { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 },
-
-    { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 }
-  };
-
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
-
-  int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
-  if (Idx == -1)
-    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
-
-  return LT.first * NEONShuffleTbl[Idx].Cost;
+  if (Kind == SK_Reverse) {
+    static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
+        // Reverse shuffle costs one instruction if we are shuffling within a
+        // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+    return LT.first * NEONShuffleTbl[Idx].Cost;
+  }
+  if (Kind == SK_Alternate) {
+    static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = {
+        // Alt shuffle cost table for ARM. Cost is the number of instructions
+        // required to create the shuffled vector.
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    int Idx =
+        CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+    return LT.first * NEONAltShuffleTbl[Idx].Cost;
+  }
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -402,17 +402,47 @@
 unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                 Type *SubTp) const {
-  // We only estimate the cost of reverse shuffles.
-  if (Kind != SK_Reverse)
+  // We only estimate the cost of reverse and alternate shuffles.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
-  unsigned Cost = 1;
-  if (LT.second.getSizeInBits() > 128)
-    Cost = 3; // Extract + insert + copy.
+  if (Kind == SK_Reverse) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    unsigned Cost = 1;
+    if (LT.second.getSizeInBits() > 128)
+      Cost = 3; // Extract + insert + copy.
 
-  // Multiple by the number of parts.
-  return Cost * LT.first;
+    // Multiply by the number of parts.
+    return Cost * LT.first;
+  }
+
+  if (Kind == SK_Alternate) {
+    static const CostTblEntry<MVT::SimpleValueType> X86AltShuffleTbl[] = {
+        // Alt shuffle cost table for X86. Cost is the number of instructions
+        // required to create the shuffled vector.
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 8},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 49}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    int Idx = CostTableLookup(X86AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+    return LT.first * X86AltShuffleTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -149,6 +149,48 @@
   return true;
 }
 
+/// \returns the opcode that can be paired with \p Op to form an alternate
+/// sequence which can later be merged as a ShuffleVector instruction.
+static unsigned getAltOpcode(unsigned Op) {
+  switch (Op) {
+  case Instruction::FAdd:
+    return Instruction::FSub;
+  case Instruction::FSub:
+    return Instruction::FAdd;
+  case Instruction::Add:
+    return Instruction::Sub;
+  case Instruction::Sub:
+    return Instruction::Add;
+  default:
+    return 0;
+  }
+}
+
+/// \returns true if Opcode \p Op can be part
+/// of an alternate sequence which can later be merged as
+/// a ShuffleVector instruction.
+static bool canCombineAsAltInst(unsigned Op) {
+  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
+      Op == Instruction::Sub || Op == Instruction::Add)
+    return true;
+  return false;
+}
+
+/// \returns ShuffleVector instruction if the instructions in \p VL form an
+/// alternating fadd,fsub / fsub,fadd / add,sub / sub,add sequence
+/// (e.g. opcodes fadd,fsub,fadd,fsub...), and zero otherwise.
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  unsigned Opcode = I0->getOpcode();
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+      return 0;
+  }
+  return Instruction::ShuffleVector;
+}
+
 /// \returns The opcode if all of the Instructions in \p VL have the same
 /// opcode, or zero.
 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
@@ -158,8 +200,11 @@
   unsigned Opcode = I0->getOpcode();
   for (int i = 1, e = VL.size(); i < e; i++) {
     Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I || Opcode != I->getOpcode())
+    if (!I || Opcode != I->getOpcode()) {
+      if (canCombineAsAltInst(Opcode) && i == 1)
+        return isAltInst(VL);
       return 0;
+    }
   }
   return Opcode;
 }
@@ -377,6 +422,7 @@
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
+
 private:
   struct TreeEntry;
 
@@ -594,6 +640,7 @@
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   bool SameTy = getSameType(VL); (void)SameTy;
+  bool isAltShuffle = false;
   assert(SameTy && "Invalid types!");
 
   if (Depth == RecursionMaxDepth) {
@@ -615,10 +662,19 @@
     newTreeEntry(VL, false);
     return;
   }
+  unsigned Opcode = getSameOpcode(VL);
+
+  // Check whether this shuffle-vector opcode refers to an alternate sequence
+  // of opcodes (the scalar instructions themselves are not shufflevectors).
+  if (Opcode == Instruction::ShuffleVector) {
+    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+    unsigned Op = I0->getOpcode();
+    if (Op != Instruction::ShuffleVector)
+      isAltShuffle = true;
+  }
 
   // If all of the operands are identical or constant we have a simple solution.
-  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
-      !getSameOpcode(VL)) {
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
     newTreeEntry(VL, false);
     return;
   }
@@ -754,8 +810,6 @@
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
 
-  unsigned Opcode = getSameOpcode(VL);
-
   // Check if it is safe to sink the loads or the stores.
   if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
     Instruction *Last = getLastInstruction(VL);
@@ -1057,6 +1111,26 @@
     }
     return;
   }
+  case Instruction::ShuffleVector: {
+    // If this is not an alternate sequence of opcodes like add-sub
+    // then do not vectorize this instruction.
+    if (!isAltShuffle) {
+      newTreeEntry(VL, false);
+      DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+      return;
+    }
+    newTreeEntry(VL, true);
+    DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      buildTree_rec(Operands, Depth + 1);
+    }
+    return;
+  }
   default:
     newTreeEntry(VL, false);
     DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
@@ -1080,11 +1154,9 @@
     }
     return getGatherCost(E->Scalars);
   }
-
-  assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
-         "Invalid VL");
+  unsigned Opcode = getSameOpcode(VL);
+  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  unsigned Opcode = VL0->getOpcode();
   switch (Opcode) {
   case Instruction::PHI: {
     return 0;
   }
@@ -1242,6 +1314,32 @@
     return VecCallCost - ScalarCallCost;
   }
+  case Instruction::ShuffleVector: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_AnyValue;
+    int ScalarCost = 0;
+    int VecCost = 0;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      Instruction *I = dyn_cast<Instruction>(VL[i]);
+      if (!I)
+        break;
+      ScalarCost +=
+          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+    }
+    // VecCost is equal to the sum of the cost of creating the two vectors
+    // and the cost of creating the shuffle.
+    Instruction *I0 = cast<Instruction>(VL[0]);
+    VecCost =
+        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+    Instruction *I1 = cast<Instruction>(VL[1]);
+    VecCost +=
+        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+    VecCost +=
+        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+    return VecCost - ScalarCost;
+  }
   default:
     llvm_unreachable("Unknown instruction");
   }
 }
@@ -1522,9 +1620,7 @@
     setInsertPointAfterBundle(E->Scalars);
     return Gather(E->Scalars, VecTy);
   }
-
-  unsigned Opcode = VL0->getOpcode();
-  assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+  unsigned Opcode = getSameOpcode(E->Scalars);
 
   switch (Opcode) {
   case Instruction::PHI: {
@@ -1797,6 +1893,49 @@
     E->VectorizedValue = V;
     return V;
   }
+  case Instruction::ShuffleVector: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+      LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+    }
+    setInsertPointAfterBundle(E->Scalars);
+
+    Value *LHS = vectorizeTree(LHSVL);
+    Value *RHS = vectorizeTree(RHSVL);
+
+    if (Value *V = alreadyVectorized(E->Scalars))
+      return V;
+
+    // Create a vector of LHS op1 RHS.
+    BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+    Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+    // Create a vector of LHS op2 RHS.
+    Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+    BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+    Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+    // Create an appropriate shuffle to pick the alternate operation results
+    // from the two vectors.
+    std::vector<Constant *> Mask(E->Scalars.size());
+    unsigned e = E->Scalars.size();
+    for (unsigned i = 0; i < e; ++i) {
+      if (i & 1)
+        Mask[i] = Builder.getInt32(e + i);
+      else
+        Mask[i] = Builder.getInt32(i);
+    }
+
+    Value *ShuffleMask = ConstantVector::get(Mask);
+
+    Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+    E->VectorizedValue = V;
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      return propagateMetadata(I, E->Scalars);
+
+    return V;
+  }
   default:
     llvm_unreachable("unknown inst");
   }
@@ -1865,7 +2004,6 @@
   // For each lane:
   for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
     Value *Scalar = Entry->Scalars[Lane];
-
     // No need to handle users of gathered values.
     if (Entry->NeedToGather)
       continue;
@@ -2049,7 +2187,6 @@
   for (po_iterator<BasicBlock *> it = po_begin(&F.getEntryBlock()),
        e = po_end(&F.getEntryBlock()); it != e; ++it) {
     BasicBlock *BB = *it;
-
     // Vectorize trees that end at stores.
     if (unsigned count = collectStores(BB, R)) {
       (void)count;
Index: test/Transforms/SLPVectorizer/X86/addsub.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/addsub.ll
+++ test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -0,0 +1,193 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global [4 x i32] zeroinitializer, align 16
+@c = common global [4 x i32] zeroinitializer, align 16
+@d = common global [4 x i32] zeroinitializer, align 16
+@e = common global [4 x i32] zeroinitializer, align 16
+@a = common global [4 x i32] zeroinitializer, align 16
+@fb = common global [4 x float] zeroinitializer, align 16
+@fc = common global [4 x float] zeroinitializer, align 16
+@fa = common global [4 x float] zeroinitializer, align 16
+
+; CHECK-LABEL: @addsub
+; CHECK: %4 = add <4 x i32> %1, %0
+; CHECK: %5 = add <4 x i32> %4, %2
+; CHECK: %6 = sub <4 x i32> %4, %2
+; CHECK: %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32>
+; CHECK: %8 = add <4 x i32> %7, %3
+; CHECK: %9 = sub <4 x i32> %7, %3
+; CHECK: %10 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32>
+
+; Function Attrs: nounwind uwtable
+define void @addsub() #0 {
+entry:
+  %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 0), align 16, !tbaa !1
+  %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 0), align 16, !tbaa !1
+  %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 0), align 16, !tbaa !1
+  %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 0), align 16, !tbaa !1
+  %add1 = add i32 %1, %0
+  %add = add i32 %add1, %2
+  %add2 = add i32 %add, %3
+  store i32 %add2, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 0), align 16, !tbaa !1
+  %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 1), align 4, !tbaa !1
+  %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 1), align 4, !tbaa !1
+  %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 1), align 4, !tbaa !1
+  %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 1), align 4, !tbaa !1
+  %add4.neg = add i32 %5, %4
+  %add3 = sub i32 %add4.neg, %6
+  %sub = sub i32 %add3, %7
+  store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 1), align 4, !tbaa !1
+  %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 2), align 8, !tbaa !1
+  %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 2), align 8, !tbaa !1
+  %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 2), align 8, !tbaa !1
+  %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 2), align 8, !tbaa !1
+  %add6 = add i32 %9, %8
+  %add5 = add i32 %add6, %10
+  %add7 = add i32 %add5, %11
+  store i32 %add7, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 2), align 8, !tbaa !1
+  %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 3), align 4, !tbaa !1
+  %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 3), align 4, !tbaa !1
+  %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 3), align 4, !tbaa !1
+  %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 3), align 4, !tbaa !1
+  %add9.neg = add i32 %13, %12
+  %add8 = sub i32 %add9.neg, %14
+  %sub10 = sub i32 %add8, %15
+  store i32 %sub10, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 3), align 4, !tbaa !1
+  ret void
+}
+
+; CHECK-LABEL: @subadd
+; CHECK: %4 = add <4 x i32>
+; CHECK: %5 = sub <4 x i32> %4, %2
+; CHECK: %6 = add <4 x i32> %4, %2
+; CHECK: %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32>
+; CHECK: %8 = sub <4 x i32> %7, %3
+; CHECK: %9 = add <4 x i32> %7, %3
+; CHECK: %10 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32>
+
+; Function Attrs: nounwind uwtable
+define void @subadd() #0 {
+entry:
+  %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 0), align 16, !tbaa !1
+  %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 0), align 16, !tbaa !1
+  %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 0), align 16, !tbaa !1
+  %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 0), align 16, !tbaa !1
+  %add1.neg = add i32 %1, %0
+  %add = sub i32 %add1.neg, %2
+  %sub = sub i32 %add, %3
+  store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 0), align 16, !tbaa !1
+  %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 1), align 4, !tbaa !1
+  %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 1), align 4, !tbaa !1
+  %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 1), align 4, !tbaa !1
+  %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 1), align 4, !tbaa !1
+  %add3 = add i32 %5, %4
+  %add2 = add i32 %add3, %6
+  %add4 = add i32 %add2, %7
+  store i32 %add4, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 1), align 4, !tbaa !1
+  %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 2), align 8, !tbaa !1
+  %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 2), align 8, !tbaa !1
+  %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 2), align 8, !tbaa !1
+  %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 2), align 8, !tbaa !1
+  %add6.neg = add i32 %9, %8
+  %add5 = sub i32 %add6.neg, %10
+  %sub7 = sub i32 %add5, %11
+  store i32 %sub7, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 2), align 8, !tbaa !1
+  %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 3), align 4, !tbaa !1
+  %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 3), align 4, !tbaa !1
+  %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 3), align 4, !tbaa !1
+  %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 3), align 4, !tbaa !1
+  %add9 = add i32 %13, %12
+  %add8 = add i32 %add9, %14
+  %add10 = add i32 %add8, %15
+  store i32 %add10, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 3), align 4, !tbaa !1
+  ret void
+}
+
+; CHECK-LABEL: @faddfsub
+; CHECK: %2 = fadd <4 x float> %0, %1
+; CHECK: %3 = fsub <4 x float> %0, %1
+; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32>
+; Function Attrs: nounwind uwtable
+define void @faddfsub() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 0), align 16, !tbaa !5
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 0), align 16, !tbaa !5
+  %add = fadd float %0, %1
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 0), align 16, !tbaa !5
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 1), align 4, !tbaa !5
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 1), align 4, !tbaa !5
+  %sub = fsub float %2, %3
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 1), align 4, !tbaa !5
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 2), align 8, !tbaa !5
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 2), align 8, !tbaa !5
+  %add1 = fadd float %4, %5
+  store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 2), align 8, !tbaa !5
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 3), align 4, !tbaa !5
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 3), align 4, !tbaa !5
+  %sub2 = fsub float %6, %7
+  store float %sub2, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 3), align 4, !tbaa !5
+  ret void
+}
+
+; CHECK-LABEL: @fsubfadd
+; CHECK: %2 = fsub <4 x float> %0, %1
+; CHECK: %3 = fadd <4 x float> %0, %1
+; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32>
+; Function Attrs: nounwind uwtable
+define void @fsubfadd() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 0), align 16, !tbaa !5
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 0), align 16, !tbaa !5
+  %sub = fsub float %0, %1
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 0), align 16, !tbaa !5
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 1), align 4, !tbaa !5
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 1), align 4, !tbaa !5
+  %add = fadd float %2, %3
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 1), align 4, !tbaa !5
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 2), align 8, !tbaa !5
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 2), align 8, !tbaa !5
+  %sub1 = fsub float %4, %5
+  store float %sub1, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 2), align 8, !tbaa !5
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 3), align 4, !tbaa !5
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 3), align 4, !tbaa !5
+  %add2 = fadd float %6, %7
+  store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 3), align 4, !tbaa !5
+  ret void
+}
+
+; CHECK-LABEL: @No_faddfsub
+; CHECK-NOT: fadd <4 x float>
+; CHECK-NOT: fsub <4 x float>
+; CHECK-NOT: shufflevector
+; Function Attrs: nounwind uwtable
+define void @No_faddfsub() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 0), align 16, !tbaa !5
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 0), align 16, !tbaa !5
+  %add = fadd float %0, %1
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 0), align 16, !tbaa !5
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 1), align 4, !tbaa !5
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 1), align 4, !tbaa !5
+  %add1 = fadd float %2, %3
+  store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 1), align 4, !tbaa !5
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 2), align 8, !tbaa !5
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 2), align 8, !tbaa !5
+  %add2 = fadd float %4, %5
+  store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 2), align 8, !tbaa !5
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 3), align 4, !tbaa !5
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 3), align 4, !tbaa !5
+  %sub = fsub float %6, %7
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 3), align 4, !tbaa !5
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !6, i64 0}
+!6 = metadata !{metadata !"float", metadata !3, i64 0}
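
Note (illustrative only, not part of the patch): the scalar pattern this change targets looks roughly like the C sketch below. The array and function names are hypothetical and simply mirror the globals and the @addsub function in addsub.ll; a front end may of course emit somewhat different IR. Even-indexed elements are combined with add and odd-indexed elements with sub over otherwise identical operand trees, which is the alternating opcode sequence that getSameOpcode()/isAltInst() now recognize and that is vectorized as two vector binary operations merged by a shufflevector (costed via SK_Alternate).

/* Hypothetical C source mirroring the @addsub test: even lanes add,
   odd lanes subtract, over otherwise identical operand trees. */
int a[4], b[4], c[4], d[4], e[4];

void addsub(void) {
  a[0] = (b[0] + c[0]) + d[0] + e[0];
  a[1] = (b[1] + c[1]) - d[1] - e[1];
  a[2] = (b[2] + c[2]) + d[2] + e[2];
  a[3] = (b[3] + c[3]) - d[3] - e[3];
}

After SLP vectorization, the even and odd results come from one vector add and one vector sub whose results are combined by a shufflevector selecting lanes <0,5,2,7>, which is what the CHECK lines in the test verify.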