Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -465,7 +465,8 @@ /// Collect the instructions from the original loop that would be trivially /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions(); + void collectTriviallyDeadInstructions( + SmallPtrSetImpl &DeadInstructions); /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. @@ -479,8 +480,9 @@ /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - /// A helper function to vectorize a single BB within the innermost loop. - void vectorizeBlockInLoop(BasicBlock *BB); + /// A helper function to vectorize a single instruction within the innermost + /// loop. + void vectorizeInstruction(Instruction &I); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and @@ -798,14 +800,6 @@ // Record whether runtime checks are added. bool AddedSafetyChecks; - // Holds instructions from the original loop whose counterparts in the - // vectorized loop would be trivially dead if generated. For example, - // original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet DeadInstructions; - // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap IVEndValues; @@ -3892,19 +3886,26 @@ // //===------------------------------------------------===// - // Collect instructions from the original loop that will become trivially - // dead in the vectorized loop. We don't need to vectorize these - // instructions. - collectTriviallyDeadInstructions(); + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); // Scan the loop in a topological order to ensure that defs are vectorized // before users. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); - // Vectorize all of the blocks in the original loop. + // Vectorize all instructions in the original loop that will not become + // trivially dead when vectorized. for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - vectorizeBlockInLoop(BB); + for (Instruction &I : *BB) + if (!DeadInstructions.count(&I)) + vectorizeInstruction(I); // Insert truncates and extends for any truncated instructions as hints to // InstCombine. @@ -4334,7 +4335,8 @@ } } -void InnerLoopVectorizer::collectTriviallyDeadInstructions() { +void InnerLoopVectorizer::collectTriviallyDeadInstructions( + SmallPtrSetImpl &DeadInstructions) { BasicBlock *Latch = OrigLoop->getLoopLatch(); // We create new control-flow for the vectorized loop, so the original @@ -4720,333 +4722,323 @@ return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB) { - // For each instruction in the old loop. - for (Instruction &I : *BB) { - - // If the instruction will become trivially dead when vectorized, we don't - // need to generate it. - if (DeadInstructions.count(&I)) - continue; +void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { + // Scalarize instructions that should remain scalar after vectorization. + if (VF > 1 && + !(isa(&I) || isa(&I) || isa(&I)) && + shouldScalarizeInstruction(&I)) { + scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); + return; + } - // Scalarize instructions that should remain scalar after vectorization. - if (VF > 1 && - !(isa(&I) || isa(&I) || - isa(&I)) && - shouldScalarizeInstruction(&I)) { - scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); - continue; - } + switch (I.getOpcode()) { + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + break; + case Instruction::PHI: { + // Vectorize PHINodes. + widenPHIInstruction(&I, UF, VF); + break; + } // End of PHI. + case Instruction::GetElementPtr: { + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + auto *GEP = cast(&I); + VectorParts Entry(UF); - switch (I.getOpcode()) { - case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - continue; - case Instruction::PHI: { - // Vectorize PHINodes. - widenPHIInstruction(&I, UF, VF); - continue; - } // End of PHI. - case Instruction::GetElementPtr: { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - auto *GEP = cast(&I); - VectorParts Entry(UF); + if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = Builder.CreateVectorSplat(VF, Clone); + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < UF; ++Part) { - if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = Builder.CreateVectorSplat(VF, Clone); - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand()) - ? GEP->getPointerOperand() - : getVectorValue(GEP->getPointerOperand())[Part]; - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector Indices; - for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { - if (OrigLoop->isLoopInvariant(U.get())) - Indices.push_back(U.get()); - else - Indices.push_back(getVectorValue(U.get())[Part]); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = GEP->isInBounds() - ? Builder.CreateInBoundsGEP(Ptr, Indices) - : Builder.CreateGEP(Ptr, Indices); - assert((VF == 1 || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - Entry[Part] = NewGEP; + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand()) + ? GEP->getPointerOperand() + : getVectorValue(GEP->getPointerOperand())[Part]; + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector Indices; + for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { + if (OrigLoop->isLoopInvariant(U.get())) + Indices.push_back(U.get()); + else + Indices.push_back(getVectorValue(U.get())[Part]); } + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = GEP->isInBounds() + ? Builder.CreateInBoundsGEP(Ptr, Indices) + : Builder.CreateGEP(Ptr, Indices); + assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + Entry[Part] = NewGEP; } + } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, GEP); + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, GEP); + break; + } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // Scalarize with predication if this instruction may divide by zero and + // block execution is conditional, otherwise fallthrough. + if (Legal->isScalarWithPredication(&I)) { + scalarizeInstruction(&I, true); break; } - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - // Scalarize with predication if this instruction may divide by zero and - // block execution is conditional, otherwise fallthrough. - if (Legal->isScalarWithPredication(&I)) { - scalarizeInstruction(&I, true); - continue; - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen binops. - auto *BinOp = cast(&I); - setDebugLocFromInst(Builder, BinOp); - const VectorParts &A = getVectorValue(BinOp->getOperand(0)); - const VectorParts &B = getVectorValue(BinOp->getOperand(1)); - - // Use this vector value for all users of the original instruction. - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen binops. + auto *BinOp = cast(&I); + setDebugLocFromInst(Builder, BinOp); + const VectorParts &A = getVectorValue(BinOp->getOperand(0)); + const VectorParts &B = getVectorValue(BinOp->getOperand(1)); - if (BinaryOperator *VecOp = dyn_cast(V)) - VecOp->copyIRFlags(BinOp); + // Use this vector value for all users of the original instruction. + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); - Entry[Part] = V; - } + if (BinaryOperator *VecOp = dyn_cast(V)) + VecOp->copyIRFlags(BinOp); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, BinOp); - break; + Entry[Part] = V; } - case Instruction::Select: { - // Widen selects. - // If the selector is loop invariant we can create a select - // instruction with a scalar condition. Otherwise, use vector-select. - auto *SE = PSE.getSE(); - bool InvariantCond = - SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); - setDebugLocFromInst(Builder, &I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - const VectorParts &Cond = getVectorValue(I.getOperand(0)); - const VectorParts &Op0 = getVectorValue(I.getOperand(1)); - const VectorParts &Op1 = getVectorValue(I.getOperand(2)); - auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, BinOp); + break; + } + case Instruction::Select: { + // Widen selects. + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + auto *SE = PSE.getSE(); + bool InvariantCond = + SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); + setDebugLocFromInst(Builder, &I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + const VectorParts &Cond = getVectorValue(I.getOperand(0)); + const VectorParts &Op0 = getVectorValue(I.getOperand(1)); + const VectorParts &Op1 = getVectorValue(I.getOperand(2)); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part] = Builder.CreateSelect( - InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]); - } + auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); - break; + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part] = Builder.CreateSelect( + InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]); } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = dyn_cast(&I); - setDebugLocFromInst(Builder, Cmp); - const VectorParts &A = getVectorValue(Cmp->getOperand(0)); - const VectorParts &B = getVectorValue(Cmp->getOperand(1)); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *C = nullptr; - if (FCmp) { - C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); - cast(C)->copyFastMathFlags(Cmp); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); - } - Entry[Part] = C; - } + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); - break; + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = dyn_cast(&I); + setDebugLocFromInst(Builder, Cmp); + const VectorParts &A = getVectorValue(Cmp->getOperand(0)); + const VectorParts &B = getVectorValue(Cmp->getOperand(1)); + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *C = nullptr; + if (FCmp) { + C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); + cast(C)->copyFastMathFlags(Cmp); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + } + Entry[Part] = C; } - case Instruction::Store: - case Instruction::Load: - vectorizeMemoryInstruction(&I); - break; - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = dyn_cast(&I); - setDebugLocFromInst(Builder, CI); - - // Optimize the special case where the source is a constant integer - // induction variable. Notice that we can only optimize the 'trunc' case - // because (a) FP conversions lose precision, (b) sext/zext may wrap, and - // (c) other casts depend on pointer size. - if (Cost->isOptimizableIVTruncate(CI, VF)) { - widenIntOrFpInduction(cast(CI->getOperand(0)), - cast(CI)); - break; - } + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } - /// Vectorize casts. - Type *DestTy = - (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); + case Instruction::Store: + case Instruction::Load: + vectorizeMemoryInstruction(&I); + break; + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = dyn_cast(&I); + setDebugLocFromInst(Builder, CI); - const VectorParts &A = getVectorValue(CI->getOperand(0)); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); + // Optimize the special case where the source is a constant integer + // induction variable. Notice that we can only optimize the 'trunc' case + // because (a) FP conversions lose precision, (b) sext/zext may wrap, and + // (c) other casts depend on pointer size. + if (Cost->isOptimizableIVTruncate(CI, VF)) { + widenIntOrFpInduction(cast(CI->getOperand(0)), + cast(CI)); break; } - case Instruction::Call: { - // Ignore dbg intrinsics. - if (isa(I)) - break; - setDebugLocFromInst(Builder, &I); + /// Vectorize casts. + Type *DestTy = + (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); - Module *M = BB->getParent()->getParent(); - auto *CI = cast(&I); + const VectorParts &A = getVectorValue(CI->getOperand(0)); + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } - StringRef FnName = CI->getCalledFunction()->getName(); - Function *F = CI->getCalledFunction(); - Type *RetTy = ToVectorTy(CI->getType(), VF); - SmallVector Tys; - for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(&I); - break; - } - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize; - unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); - bool UseVectorIntrinsic = - ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; - if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(&I); - break; - } + case Instruction::Call: { + // Ignore dbg intrinsics. + if (isa(I)) + break; + setDebugLocFromInst(Builder, &I); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - Value *Arg = CI->getArgOperand(i); - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { - const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); - Arg = VectorArg[Part]; - } - Args.push_back(Arg); - } + Module *M = I.getParent()->getParent()->getParent(); + auto *CI = cast(&I); - Function *VectorF; - if (UseVectorIntrinsic) { - // Use vector version of the intrinsic. - Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); - } else { - // Use vector version of the library call. - StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); - assert(!VFnName.empty() && "Vector function name is empty."); - VectorF = M->getFunction(VFnName); - if (!VectorF) { - // Generate a declaration - FunctionType *FTy = FunctionType::get(RetTy, Tys, false); - VectorF = - Function::Create(FTy, Function::ExternalLinkage, VFnName, M); - VectorF->copyAttributesFrom(F); - } + StringRef FnName = CI->getCalledFunction()->getName(); + Function *F = CI->getCalledFunction(); + Type *RetTy = ToVectorTy(CI->getType(), VF); + SmallVector Tys; + for (Value *ArgOperand : CI->arg_operands()) + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start)) { + scalarizeInstruction(&I); + break; + } + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize; + unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + bool UseVectorIntrinsic = + ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + if (!UseVectorIntrinsic && NeedToScalarize) { + scalarizeInstruction(&I); + break; + } + + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector Args; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + Value *Arg = CI->getArgOperand(i); + // Some intrinsics have a scalar argument - don't replace it with a + // vector. + if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { + const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); + Arg = VectorArg[Part]; } - assert(VectorF && "Can't create vector function."); + Args.push_back(Arg); + } - SmallVector OpBundles; - CI->getOperandBundlesAsDefs(OpBundles); - CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); + Function *VectorF; + if (UseVectorIntrinsic) { + // Use vector version of the intrinsic. + Type *TysForDecl[] = {CI->getType()}; + if (VF > 1) + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); + } else { + // Use vector version of the library call. + StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); + assert(!VFnName.empty() && "Vector function name is empty."); + VectorF = M->getFunction(VFnName); + if (!VectorF) { + // Generate a declaration + FunctionType *FTy = FunctionType::get(RetTy, Tys, false); + VectorF = + Function::Create(FTy, Function::ExternalLinkage, VFnName, M); + VectorF->copyAttributesFrom(F); + } + } + assert(VectorF && "Can't create vector function."); - if (isa(V)) - V->copyFastMathFlags(CI); + SmallVector OpBundles; + CI->getOperandBundlesAsDefs(OpBundles); + CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); - Entry[Part] = V; - } + if (isa(V)) + V->copyFastMathFlags(CI); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); - break; + Entry[Part] = V; } - default: - // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(&I); - break; - } // end of switch. - } // end of for_each instr. + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } + + default: + // All other instructions are unsupported. Scalarize them. + scalarizeInstruction(&I); + break; + } // end of switch. } void InnerLoopVectorizer::updateAnalysis() {