Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -322,6 +322,7 @@
   enum ShuffleKind {
     SK_Broadcast,       ///< Broadcast element 0 to all other elements.
     SK_Reverse,         ///< Reverse the order of the vector.
+    SK_Alternate,       ///< Choose alternate elements from vector.
     SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
     SK_ExtractSubvector ///< ExtractSubvector. Index indicates start offset.
   };
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -569,6 +569,10 @@
   unsigned getShuffleCost(ShuffleKind Kind, Type *Ty, int Index = 0,
                           Type *SubTp = nullptr) const override {
+    // We generate 2 instructions to represent an SK_Alternate shuffle, so
+    // return a cost of 2.
+    if (Kind == SK_Alternate)
+      return 2;
     return 1;
   }
Index: lib/CodeGen/BasicTargetTransformInfo.cpp
===================================================================
--- lib/CodeGen/BasicTargetTransformInfo.cpp
+++ lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -329,6 +329,10 @@
 unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) const {
+  // We generate 2 instructions to represent an SK_Alternate shuffle, so
+  // return a cost of 2.
+  if (Kind == SK_Alternate)
+    return 2;
   return 1;
 }
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -149,6 +149,50 @@
   return true;
 }
 
+/// \returns the opcode that can be combined with \p Op to form an
+/// alternating sequence which can later be merged as a ShuffleVector
+/// instruction.
+static unsigned getAltOpcode(unsigned Op) {
+  switch (Op) {
+  case Instruction::FAdd:
+    return Instruction::FSub;
+  case Instruction::FSub:
+    return Instruction::FAdd;
+  case Instruction::Add:
+    return Instruction::Sub;
+  case Instruction::Sub:
+    return Instruction::Add;
+  default:
+    return 0;
+  }
+}
+
+/// \returns true if opcode \p Op can be part of an alternating sequence
+/// which can later be merged as a ShuffleVector instruction.
+static bool canCombineAsAltInst(unsigned Op) {
+  return Op == Instruction::FAdd || Op == Instruction::FSub ||
+         Op == Instruction::Sub || Op == Instruction::Add;
+}
+
+/// \returns the ShuffleVector opcode if the instructions in \p VL form an
+/// alternating fadd/fsub, fsub/fadd, add/sub, or sub/add sequence (i.e.
+/// opcodes of the form fadd, fsub, fadd, fsub, ...), and zero otherwise.
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  unsigned Opcode = I0->getOpcode();
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+      return 0;
+  }
+  return Instruction::ShuffleVector;
+}
+
 /// \returns The opcode if all of the Instructions in \p VL have the same
 /// opcode, or zero.
 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
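As a reader's aid, here is a minimal standalone sketch (plain C++ with hypothetical names, not part of the patch) of the even/odd check that isAltInst performs: lanes at even positions must all carry the first lane's opcode, and lanes at odd positions must carry its alternate.

#include <cassert>
#include <cstddef>
#include <vector>

enum Opcode { Add, Sub, FAdd, FSub, Other };

// Mirrors getAltOpcode: the opcode allowed to interleave with Op.
Opcode altOpcode(Opcode Op) {
  switch (Op) {
  case Add:  return Sub;
  case Sub:  return Add;
  case FAdd: return FSub;
  case FSub: return FAdd;
  default:   return Other;
  }
}

// Mirrors isAltInst: true if Ops reads Op, Alt, Op, Alt, ...
bool isAlternatingBundle(const std::vector<Opcode> &Ops) {
  Opcode Op = Ops[0], Alt = altOpcode(Op);
  if (Alt == Other)
    return false;
  for (std::size_t i = 1; i < Ops.size(); ++i)
    if (Ops[i] != ((i & 1) ? Alt : Op))
      return false;
  return true;
}

int main() {
  assert(isAlternatingBundle({Add, Sub, Add, Sub}));  // the addsub pattern
  assert(!isAlternatingBundle({Add, Add, Add, Sub})); // a stray lane defeats it
  return 0;
}

The second assert is the shape exercised by the @No_faddfsub test below: one out-of-sequence opcode at any lane rejects the whole bundle.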
@@ -158,8 +202,11 @@
   unsigned Opcode = I0->getOpcode();
   for (int i = 1, e = VL.size(); i < e; i++) {
     Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I || Opcode != I->getOpcode())
+    if (!I || Opcode != I->getOpcode()) {
+      if (canCombineAsAltInst(Opcode) && i == 1)
+        return isAltInst(VL);
       return 0;
+    }
   }
   return Opcode;
 }
@@ -377,6 +424,7 @@
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
+
 private:
   struct TreeEntry;
@@ -594,6 +642,7 @@
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   bool SameTy = getSameType(VL); (void)SameTy;
+  bool isAltShuffle = false;
   assert(SameTy && "Invalid types!");
 
   if (Depth == RecursionMaxDepth) {
@@ -615,10 +664,19 @@
     newTreeEntry(VL, false);
     return;
   }
+  unsigned Opcode = getSameOpcode(VL);
+
+  // Check that this shuffle vector refers to the alternating sequence of
+  // opcodes.
+  if (Opcode == Instruction::ShuffleVector) {
+    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+    unsigned Op = I0->getOpcode();
+    if (Op != Instruction::ShuffleVector)
+      isAltShuffle = true;
+  }
 
   // If all of the operands are identical or constant we have a simple solution.
-  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
-      !getSameOpcode(VL)) {
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
     newTreeEntry(VL, false);
     return;
   }
@@ -754,8 +812,6 @@
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
 
-  unsigned Opcode = getSameOpcode(VL);
-
   // Check if it is safe to sink the loads or the stores.
   if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
     Instruction *Last = getLastInstruction(VL);
@@ -1057,6 +1113,26 @@
     }
     return;
   }
+  case Instruction::ShuffleVector: {
+    // If this is not an alternating sequence of opcodes like add-sub,
+    // then do not vectorize this instruction.
+    if (!isAltShuffle) {
+      newTreeEntry(VL, false);
+      DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+      return;
+    }
+    newTreeEntry(VL, true);
+    DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      buildTree_rec(Operands, Depth + 1);
+    }
+    return;
+  }
   default:
     newTreeEntry(VL, false);
     DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
@@ -1080,11 +1156,9 @@
     }
     return getGatherCost(E->Scalars);
   }
-
-  assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
-         "Invalid VL");
+  unsigned Opcode = getSameOpcode(VL);
+  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  unsigned Opcode = VL0->getOpcode();
   switch (Opcode) {
   case Instruction::PHI: {
     return 0;
   }
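The next hunk teaches the cost model about the alternating case. As a rough sanity check of the arithmetic (assuming the flat unit costs above: 1 per arithmetic instruction and the cost of 2 this patch installs for an SK_Alternate shuffle; real targets will differ), a 4-lane add/sub bundle breaks even on its own, so the net win must come from the vectorized loads and stores elsewhere in the tree:

#include <cstdio>

int main() {
  const int Lanes = 4;
  // One scalar add or sub per lane, unit cost each.
  int ScalarCost = Lanes * 1;
  // One vector add + one vector sub + the SK_Alternate shuffle (cost 2).
  int VecCost = 1 + 1 + 2;
  // Prints 0: the bundle itself is cost-neutral under these assumptions.
  std::printf("VecCost - ScalarCost = %d\n", VecCost - ScalarCost);
  return 0;
}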
@@ -1242,6 +1316,34 @@
     return VecCallCost - ScalarCallCost;
   }
+  case Instruction::ShuffleVector: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_AnyValue;
+    int ScalarCost = 0;
+    int VecCost = 0;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      Instruction *I = cast<Instruction>(VL[i]);
+      if (!I)
+        break;
+      ScalarCost +=
+          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+    }
+    // VecCost is equal to the sum of the cost of creating 2 vectors
+    // and the cost of creating the shuffle.
+    Instruction *I0 = cast<Instruction>(VL[0]);
+    VecCost =
+        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+    Instruction *I1 = cast<Instruction>(VL[1]);
+    VecCost +=
+        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+    // TODO: Improve the cost model. Currently getShuffleCost returns a cost
+    // of 2. This needs to be replaced with something like getAddSubCost.
+    VecCost +=
+        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+    return VecCost - ScalarCost;
+  }
   default:
     llvm_unreachable("Unknown instruction");
   }
@@ -1522,9 +1624,7 @@
     setInsertPointAfterBundle(E->Scalars);
     return Gather(E->Scalars, VecTy);
   }
-
-  unsigned Opcode = VL0->getOpcode();
-  assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+  unsigned Opcode = getSameOpcode(E->Scalars);
 
   switch (Opcode) {
   case Instruction::PHI: {
@@ -1797,6 +1897,49 @@
     E->VectorizedValue = V;
     return V;
   }
+  case Instruction::ShuffleVector: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+      LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+    }
+    setInsertPointAfterBundle(E->Scalars);
+
+    Value *LHS = vectorizeTree(LHSVL);
+    Value *RHS = vectorizeTree(RHSVL);
+
+    if (Value *V = alreadyVectorized(E->Scalars))
+      return V;
+
+    // Create a vector of LHS op1 RHS.
+    BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+    Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+    // Create a vector of LHS op2 RHS.
+    Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+    BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+    Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+    // Create an appropriate shuffle to take the alternating operations
+    // from the two vectors.
+    std::vector<Constant *> Mask(E->Scalars.size());
+    unsigned e = E->Scalars.size();
+    for (unsigned i = 0; i < e; ++i) {
+      if (i & 1)
+        Mask[i] = Builder.getInt32(e + i);
+      else
+        Mask[i] = Builder.getInt32(i);
+    }
+
+    Value *ShuffleMask = ConstantVector::get(Mask);
+
+    Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+    E->VectorizedValue = V;
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      return propagateMetadata(I, E->Scalars);
+
+    return V;
+  }
   default:
     llvm_unreachable("unknown inst");
   }
@@ -1865,7 +2008,6 @@
   // For each lane:
   for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
     Value *Scalar = Entry->Scalars[Lane];
-
     // No need to handle users of gathered values.
     if (Entry->NeedToGather)
       continue;
@@ -2049,7 +2191,6 @@
   for (po_iterator<BasicBlock *> it = po_begin(&F.getEntryBlock()),
                                  e = po_end(&F.getEntryBlock());
        it != e; ++it) {
     BasicBlock *BB = *it;
-
     // Vectorize trees that end at stores.
     if (unsigned count = collectStores(BB, R)) {
       (void)count;
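Before the test file, a small standalone sketch (hypothetical helper name, not patch code) of the mask that vectorizeTree builds above: even lanes select from V0, odd lanes from V1, whose elements sit at indices e..2e-1 in shufflevector operand numbering.

#include <cstdio>
#include <vector>

// Mirrors the mask loop in the ShuffleVector case of vectorizeTree.
std::vector<unsigned> alternateMask(unsigned e) {
  std::vector<unsigned> Mask(e);
  for (unsigned i = 0; i < e; ++i)
    Mask[i] = (i & 1) ? e + i : i;
  return Mask;
}

int main() {
  // Prints "0 5 2 7" for a 4-lane bundle, matching the
  // <i32 0, i32 5, i32 2, i32 7> masks checked in the test below.
  for (unsigned Idx : alternateMask(4))
    std::printf("%u ", Idx);
  std::printf("\n");
  return 0;
}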
Index: test/Transforms/SLPVectorizer/X86/addsub.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/addsub.ll
+++ test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -0,0 +1,193 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global [4 x i32] zeroinitializer, align 16
+@c = common global [4 x i32] zeroinitializer, align 16
+@d = common global [4 x i32] zeroinitializer, align 16
+@e = common global [4 x i32] zeroinitializer, align 16
+@a = common global [4 x i32] zeroinitializer, align 16
+@fb = common global [4 x float] zeroinitializer, align 16
+@fc = common global [4 x float] zeroinitializer, align 16
+@fa = common global [4 x float] zeroinitializer, align 16
+
+; CHECK-LABEL: @addsub
+; CHECK: %4 = add <4 x i32> %1, %0
+; CHECK: %5 = add <4 x i32> %4, %2
+; CHECK: %6 = sub <4 x i32> %4, %2
+; CHECK: %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK: %8 = add <4 x i32> %7, %3
+; CHECK: %9 = sub <4 x i32> %7, %3
+; CHECK: %10 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+
+; Function Attrs: nounwind uwtable
+define void @addsub() #0 {
+entry:
+  %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 0), align 16, !tbaa !1
+  %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 0), align 16, !tbaa !1
+  %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 0), align 16, !tbaa !1
+  %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 0), align 16, !tbaa !1
+  %add1 = add i32 %1, %0
+  %add = add i32 %add1, %2
+  %add2 = add i32 %add, %3
+  store i32 %add2, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 0), align 16, !tbaa !1
+  %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 1), align 4, !tbaa !1
+  %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 1), align 4, !tbaa !1
+  %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 1), align 4, !tbaa !1
+  %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 1), align 4, !tbaa !1
+  %add4.neg = add i32 %5, %4
+  %add3 = sub i32 %add4.neg, %6
+  %sub = sub i32 %add3, %7
+  store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 1), align 4, !tbaa !1
+  %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 2), align 8, !tbaa !1
+  %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 2), align 8, !tbaa !1
+  %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 2), align 8, !tbaa !1
+  %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 2), align 8, !tbaa !1
+  %add6 = add i32 %9, %8
+  %add5 = add i32 %add6, %10
+  %add7 = add i32 %add5, %11
+  store i32 %add7, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 2), align 8, !tbaa !1
+  %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 3), align 4, !tbaa !1
+  %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 3), align 4, !tbaa !1
+  %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 3), align 4, !tbaa !1
+  %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 3), align 4, !tbaa !1
+  %add9.neg = add i32 %13, %12
+  %add8 = sub i32 %add9.neg, %14
+  %sub10 = sub i32 %add8, %15
+  store i32 %sub10, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 3), align 4, !tbaa !1
+  ret void
+}
+
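For orientation, C source along the following lines (an illustrative assumption; the test ships only the IR) lowers to the @addsub pattern above: even lanes add d and e, odd lanes subtract them.

int a[4], b[4], c[4], d[4], e[4];

void addsub(void) {
  a[0] = b[0] + c[0] + d[0] + e[0];
  a[1] = b[1] + c[1] - d[1] - e[1];
  a[2] = b[2] + c[2] + d[2] + e[2];
  a[3] = b[3] + c[3] - d[3] - e[3];
}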
+; CHECK-LABEL: @subadd
+; CHECK: %4 = add <4 x i32>
+; CHECK: %5 = sub <4 x i32> %4, %2
+; CHECK: %6 = add <4 x i32> %4, %2
+; CHECK: %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK: %8 = sub <4 x i32> %7, %3
+; CHECK: %9 = add <4 x i32> %7, %3
+; CHECK: %10 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+
+; Function Attrs: nounwind uwtable
+define void @subadd() #0 {
+entry:
+  %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 0), align 16, !tbaa !1
+  %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 0), align 16, !tbaa !1
+  %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 0), align 16, !tbaa !1
+  %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 0), align 16, !tbaa !1
+  %add1.neg = add i32 %1, %0
+  %add = sub i32 %add1.neg, %2
+  %sub = sub i32 %add, %3
+  store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 0), align 16, !tbaa !1
+  %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 1), align 4, !tbaa !1
+  %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 1), align 4, !tbaa !1
+  %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 1), align 4, !tbaa !1
+  %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 1), align 4, !tbaa !1
+  %add3 = add i32 %5, %4
+  %add2 = add i32 %add3, %6
+  %add4 = add i32 %add2, %7
+  store i32 %add4, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 1), align 4, !tbaa !1
+  %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 2), align 8, !tbaa !1
+  %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 2), align 8, !tbaa !1
+  %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 2), align 8, !tbaa !1
+  %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 2), align 8, !tbaa !1
+  %add6.neg = add i32 %9, %8
+  %add5 = sub i32 %add6.neg, %10
+  %sub7 = sub i32 %add5, %11
+  store i32 %sub7, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 2), align 8, !tbaa !1
+  %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i64 0, i64 3), align 4, !tbaa !1
+  %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i64 0, i64 3), align 4, !tbaa !1
+  %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i64 0, i64 3), align 4, !tbaa !1
+  %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i64 0, i64 3), align 4, !tbaa !1
+  %add9 = add i32 %13, %12
+  %add8 = add i32 %add9, %14
+  %add10 = add i32 %add8, %15
+  store i32 %add10, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 3), align 4, !tbaa !1
+  ret void
+}
+
+; CHECK-LABEL: @faddfsub
+; CHECK: %2 = fadd <4 x float> %0, %1
+; CHECK: %3 = fsub <4 x float> %0, %1
+; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; Function Attrs: nounwind uwtable
+define void @faddfsub() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 0), align 16, !tbaa !5
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 0), align 16, !tbaa !5
+  %add = fadd float %0, %1
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 0), align 16, !tbaa !5
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 1), align 4, !tbaa !5
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 1), align 4, !tbaa !5
+  %sub = fsub float %2, %3
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 1), align 4, !tbaa !5
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 2), align 8, !tbaa !5
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 2), align 8, !tbaa !5
+  %add1 = fadd float %4, %5
+  store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 2), align 8, !tbaa !5
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 3), align 4, !tbaa !5
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 3), align 4, !tbaa !5
+  %sub2 = fsub float %6, %7
+  store float %sub2, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 3), align 4, !tbaa !5
+  ret void
+}
+
+; CHECK-LABEL: @fsubfadd
+; CHECK: %2 = fsub <4 x float> %0, %1
+; CHECK: %3 = fadd <4 x float> %0, %1
+; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; Function Attrs: nounwind uwtable
+define void @fsubfadd() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 0), align 16, !tbaa !5
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 0), align 16, !tbaa !5
+  %sub = fsub float %0, %1
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 0), align 16, !tbaa !5
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 1), align 4, !tbaa !5
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 1), align 4, !tbaa !5
+  %add = fadd float %2, %3
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 1), align 4, !tbaa !5
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 2), align 8, !tbaa !5
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 2), align 8, !tbaa !5
+  %sub1 = fsub float %4, %5
+  store float %sub1, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 2), align 8, !tbaa !5
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 3), align 4, !tbaa !5
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 3), align 4, !tbaa !5
+  %add2 = fadd float %6, %7
+  store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 3), align 4, !tbaa !5
+  ret void
+}
+
+; CHECK-LABEL: @No_faddfsub
+; CHECK-NOT: fadd <4 x float>
+; CHECK-NOT: fsub <4 x float>
+; CHECK-NOT: shufflevector
+; Function Attrs: nounwind uwtable
+define void @No_faddfsub() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 0), align 16, !tbaa !5
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 0), align 16, !tbaa !5
+  %add = fadd float %0, %1
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 0), align 16, !tbaa !5
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 1), align 4, !tbaa !5
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 1), align 4, !tbaa !5
+  %add1 = fadd float %2, %3
+  store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 1), align 4, !tbaa !5
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 2), align 8, !tbaa !5
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 2), align 8, !tbaa !5
+  %add2 = fadd float %4, %5
+  store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 2), align 8, !tbaa !5
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i64 0, i64 3), align 4, !tbaa !5
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i64 0, i64 3), align 4, !tbaa !5
+  %sub = fsub float %6, %7
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i64 0, i64 3), align 4, !tbaa !5
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !6, i64 0}
+!6 = metadata !{metadata !"float", metadata !3, i64 0}