Index: include/llvm/Analysis/VectorUtils.h
===================================================================
--- include/llvm/Analysis/VectorUtils.h
+++ include/llvm/Analysis/VectorUtils.h
@@ -28,6 +28,7 @@
 class TargetTransformInfo;
 class Type;
 class Value;
+class VectorVariant;

 namespace Intrinsic {
 enum ID : unsigned;
@@ -176,6 +177,10 @@
 /// elements, it will be padded with undefs.
 Value *concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs);

+/// \brief Determine the characteristic type of the vector function as
+/// specified by the vector function ABI.
+Type *calcCharacteristicType(Function &F, VectorVariant &Variant);
+
 } // llvm namespace

 #endif
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1730,6 +1730,10 @@
           TLI->isFunctionVectorizable(Call->getCalledFunction()->getName()))
         continue;

+      if (Call && Call->getCalledFunction() &&
+          Call->getCalledFunction()->hasFnAttribute("vector-variants"))
+        continue;
+
       auto *Ld = dyn_cast<LoadInst>(&I);
       if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
         recordAnalysis("NonSimpleLoad", Ld)
Index: lib/Analysis/VectorUtils.cpp
===================================================================
--- lib/Analysis/VectorUtils.cpp
+++ lib/Analysis/VectorUtils.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorVariant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
@@ -574,3 +575,47 @@

   return ResList[0];
 }
+
+Type *llvm::calcCharacteristicType(Function &F, VectorVariant &Variant) {
+  Type *ReturnType = F.getReturnType();
+  Type *CharacteristicDataType = nullptr;
+
+  if (!ReturnType->isVoidTy())
+    CharacteristicDataType = ReturnType;
+
+  if (!CharacteristicDataType) {
+    std::vector<VectorKind> &ParmKinds = Variant.getParameters();
+    Function::const_arg_iterator ArgIt = F.arg_begin();
+    Function::const_arg_iterator ArgEnd = F.arg_end();
+    std::vector<VectorKind>::iterator VKIt = ParmKinds.begin();
+
+    for (; ArgIt != ArgEnd; ++ArgIt, ++VKIt) {
+      if (VKIt->isVector()) {
+        CharacteristicDataType = ArgIt->getType();
+        break;
+      }
+    }
+  }
+
+  // TODO: also exclude Clang's ComplexType.
+  if (!CharacteristicDataType || CharacteristicDataType->isStructTy())
+    CharacteristicDataType = Type::getInt32Ty(F.getContext());
+
+  // Legalize the characteristic type based on target requirements.
+  CharacteristicDataType =
+      Variant.promoteToSupportedType(CharacteristicDataType);
+
+  if (CharacteristicDataType->isPointerTy()) {
+    // For cases such as 'int* foo(int x)', where x is a non-vector type, the
+    // characteristic type at this point will be i32*. If we used the
+    // DataLayout to query the supported pointer size, a promotion to i64*
+    // would be incorrect because the mask element type would then mismatch
+    // the element type of the characteristic type. Use the pointee type
+    // instead.
+    PointerType *PointerTy = cast<PointerType>(CharacteristicDataType);
+    CharacteristicDataType = PointerTy->getElementType();
+  }
+
+  return CharacteristicDataType;
+}
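Reviewer note (not part of the patch): the selection order in calcCharacteristicType is the return type first, then the first 'v' (vector) parameter, then an i32 fallback, followed by target legalization and pointer-element peeling. Below is a minimal standalone C++ sketch of just that selection ladder, with a hypothetical ParamKind enum and string type names standing in for VectorVariant's parameter kinds and llvm::Type.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

enum class ParamKind { Vector, Linear, Uniform };

// Mirrors the order in calcCharacteristicType: the return type wins, then
// the first vector parameter, then an i32 fallback.
std::string pickCharacteristicType(
    const std::string &RetTy,
    const std::vector<std::pair<std::string, ParamKind>> &Params) {
  if (RetTy != "void")
    return RetTy;
  for (const auto &P : Params)
    if (P.second == ParamKind::Vector)
      return P.first;
  return "i32"; // Fallback, as in the patch.
}

int main() {
  // 'i32 dowork(i32 v, i32 l, i32 u)' -> i32 via the return type.
  std::cout << pickCharacteristicType("i32", {{"i32", ParamKind::Vector},
                                              {"i32", ParamKind::Linear},
                                              {"i32", ParamKind::Uniform}})
            << "\n";
  // 'void bar(float v)' -> float via the first vector parameter.
  std::cout << pickCharacteristicType("void", {{"float", ParamKind::Vector}})
            << "\n";
  // 'void baz(i32* l)' -> i32 fallback (no vector parameter).
  std::cout << pickCharacteristicType("void", {{"i32*", ParamKind::Linear}})
            << "\n";
}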
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -83,6 +83,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Analysis/VectorVariant.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -3857,7 +3858,8 @@
   // If we can't emit a vector call for this function, then the currently found
   // cost is the cost we need to return.
   NeedToScalarize = true;
-  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
+  if ((!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) &&
+      !CI->getCalledFunction()->hasFnAttribute("vector-variants"))
     return Cost;

   // If the corresponding vector cost is cheaper, return its cost.
@@ -4692,6 +4694,58 @@
   return !CInt || CInt->isZero();
 }

+static VectorVariant *matchVectorVariant(Function *CalledFunc, unsigned VF,
+                                         bool IsMasked,
+                                         const TargetTransformInfo *TTI) {
+  DEBUG(dbgs() << "\nCall VF: " << VF << "\n");
+  unsigned TargetMaxRegWidth = TTI->getRegisterBitWidth(true);
+  DEBUG(dbgs() << "Target Max Register Width: " << TargetMaxRegWidth << "\n");
+
+  TargetTransformInfo::ISAClass TargetIsaClass =
+      TTI->getISAClassForMaxVecRegSize();
+  DEBUG(dbgs() << "Target ISA Class: "
+               << TTI->ISAClassToString(TargetIsaClass) << "\n\n");
+
+  Attribute Attr = CalledFunc->getFnAttribute("vector-variants");
+  StringRef VariantsStr = Attr.getValueAsString();
+  SmallVector<StringRef, 8> Variants;
+  VariantsStr.split(Variants, ",");
+  for (unsigned i = 0; i < Variants.size(); i++) {
+    VectorVariant *Variant = new VectorVariant(Variants[i], TTI);
+    TargetTransformInfo::ISAClass VariantIsaClass = Variant->getISA();
+    DEBUG(dbgs() << "Variant ISA Class: "
+                 << TTI->ISAClassToString(VariantIsaClass) << "\n");
+    unsigned IsaClassMaxRegWidth =
+        TTI->ISAClassMaxRegisterWidth(VariantIsaClass);
+    DEBUG(dbgs() << "Isa Class Max Vector Register Width: "
+                 << IsaClassMaxRegWidth << "\n");
+    unsigned FuncVF = Variant->getVlen();
+    DEBUG(dbgs() << "Func VF: " << FuncVF << "\n\n");
+
+    // Pick candidate functions based on the target ISA class, masked
+    // property, and loop VF == variant VF. For now, matching is limited to
+    // exact matching on VF. If no match exists based on these criteria, the
+    // calls will be scalarized. This could be extended in the future for
+    // when:
+    //
+    // 1) the only available simd function variants have a VF that is less
+    //    than the loop VF. In this case, multiple calls can be made to the
+    //    simd function. Currently, however, LV only keeps a 1-1 scalar ->
+    //    vector Value mapping.
+    //
+    // 2) the only available simd function variants have a VF that is greater
+    //    than the loop VF. In this case, we can make the call to the simd
+    //    function and effectively mask off the unused vector parts.
+    if (VariantIsaClass == TargetIsaClass && Variant->isMasked() == IsMasked &&
+        VF == FuncVF)
+      return Variant;
+
+    // Not a match; don't leak the candidate.
+    delete Variant;
+  }
+
+  return nullptr;
+}
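Reviewer note (not part of the patch): matchVectorVariant consumes entries of the "vector-variants" attribute, which follow the vector function ABI mangling _ZGV<isa><mask><vlen><parm-kinds>_<name>; for example, _ZGVdM8vlu_dowork is the AVX2 ('d'), masked ('M'), VF=8 clone of dowork with vector/linear/uniform parameters. A hedged standalone C++ sketch of that decoding follows; it assumes well-formed names and ignores linear-step suffixes such as 'l4', and parseVariant is an illustrative name, not a function in the patch.

#include <iostream>
#include <string>

// Decoded pieces of a '_ZGV<isa><mask><vlen><parm-kinds>_<name>' string.
struct Variant {
  char IsaClass;          // 'b'/'c'/'d'/'e' ~ SSE/AVX/AVX2/AVX-512 on x86
  bool Masked;            // 'M' masked, 'N' unmasked
  unsigned Vlen;          // vector length (VF)
  std::string ParamKinds; // one letter per parameter: v/l/u
  std::string Name;       // original scalar function name
};

bool parseVariant(const std::string &S, Variant &V) {
  if (S.compare(0, 4, "_ZGV") != 0 || S.size() < 8)
    return false;
  size_t Pos = 4;
  V.IsaClass = S[Pos++];
  V.Masked = S[Pos++] == 'M';
  size_t Digits = 0;
  V.Vlen = std::stoul(S.substr(Pos), &Digits); // assumes digits follow
  Pos += Digits;
  size_t Sep = S.find('_', Pos);
  if (Sep == std::string::npos)
    return false;
  V.ParamKinds = S.substr(Pos, Sep - Pos);
  V.Name = S.substr(Sep + 1);
  return true;
}

int main() {
  Variant V;
  if (parseVariant("_ZGVdM8vlu_dowork", V))
    std::cout << "isa=" << V.IsaClass << " masked=" << V.Masked
              << " vlen=" << V.Vlen << " kinds=" << V.ParamKinds
              << " name=" << V.Name << "\n";
}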
 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   switch (I.getOpcode()) {
   case Instruction::Br:
@@ -4895,13 +4949,42 @@
   Module *M = I.getParent()->getParent()->getParent();
   auto *CI = cast<CallInst>(&I);

-  StringRef FnName = CI->getCalledFunction()->getName();
   Function *F = CI->getCalledFunction();
+  StringRef FnName = F->getName();
+
+  // Find the appropriate simd function match and generate the mask if this
+  // is a masked simd function.
+  bool UseSimdFunction = F->hasFnAttribute("vector-variants");
+  std::vector<VectorKind> SimdFuncParms;
+  VectorVariant *SimdVariant = nullptr;
+  bool IsMasked = Legal->isMaskRequired(&I);
+  Type *CharacteristicTy = nullptr;
+  VectorParts Mask;
+  if (UseSimdFunction) {
+    SimdVariant = matchVectorVariant(F, VF, IsMasked, TTI);
+    assert(SimdVariant && "expected a matching vector variant");
+    DEBUG(dbgs() << "Matched Variant: " << SimdVariant->encode() << "\n");
+    SimdFuncParms = SimdVariant->getParameters();
+    CharacteristicTy = calcCharacteristicType(*F, *SimdVariant);
+    if (IsMasked)
+      Mask = createBlockInMask(CI->getParent());
+  }

   Type *RetTy = ToVectorTy(CI->getType(), VF);
   SmallVector<Type *, 4> Tys;
-  for (Value *ArgOperand : CI->arg_operands())
-    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+  for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
+    Value *ArgOperand = CI->getArgOperand(i);
+    if (!UseSimdFunction || SimdFuncParms[i].isVector())
+      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+    else
+      Tys.push_back(ArgOperand->getType());
+  }
+
+  // Masked simd functions need an extra mask parameter, so add its type to
+  // the Tys list.
+  if (UseSimdFunction && IsMasked)
+    Tys.push_back(ToVectorTy(CharacteristicTy, VF));

   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

@@ -4920,12 +5003,41 @@
     for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
       Value *Arg = CI->getArgOperand(i);
       // Some intrinsics have a scalar argument - don't replace it with a
-      // vector.
-      if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
+      // vector. Likewise, linear and uniform parameters for simd functions
+      // are passed as scalars, so don't vectorize those either.
+      bool IsScalarSimdArg =
+          UseSimdFunction &&
+          (SimdFuncParms[i].isLinear() || SimdFuncParms[i].isUniform());
+      if (IsScalarSimdArg)
+        Arg = getOrCreateScalarValue(Arg, {Part, 0});
+      else if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
         Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
       Args.push_back(Arg);
     }

+    // Promote the <VF x i1> mask to a vector of the characteristic type and
+    // add it to the Args list.
+    if (UseSimdFunction && IsMasked) {
+      unsigned CharacteristicTySize =
+          CharacteristicTy->getPrimitiveSizeInBits();
+      // Mask is a vector of i1. Promote it to an integer type that has the
+      // same size as the characteristic type.
+      Type *ScalarToType = IntegerType::get(CharacteristicTy->getContext(),
+                                            CharacteristicTySize);
+      VectorType *VecToType = VectorType::get(ScalarToType, VF);
+      Value *MaskExt = Builder.CreateSExt(Mask[Part], VecToType, "mask.ext");
+
+      // Bitcast if the promoted type is not the same as the characteristic
+      // type.
+      if (ScalarToType != CharacteristicTy) {
+        Type *MaskCastTy = VectorType::get(CharacteristicTy, VF);
+        Value *MaskCast =
+            Builder.CreateBitCast(MaskExt, MaskCastTy, "mask.cast");
+        Args.push_back(MaskCast);
+      } else {
+        Args.push_back(MaskExt);
+      }
+    }
+
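Reviewer note (not part of the patch): the mask promotion above turns the <VF x i1> block mask into a full-width vector of the characteristic type, e.g. <8 x i1> -> sext -> <8 x i32> -> bitcast -> <8 x float> when the characteristic type is float. A condensed IRBuilder sketch of those same two steps, with illustrative types and VF (promoteMask is not a name in the patch):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

Value *promoteMask(IRBuilder<> &Builder, Value *MaskI1, Type *CharTy,
                   unsigned VF) {
  unsigned Bits = CharTy->getPrimitiveSizeInBits();
  // <VF x i1> -> <VF x iN>, where N is the characteristic type's bit width.
  Type *IntTy = IntegerType::get(CharTy->getContext(), Bits);
  Value *Ext =
      Builder.CreateSExt(MaskI1, VectorType::get(IntTy, VF), "mask.ext");
  if (IntTy == CharTy)
    return Ext;
  // e.g. <8 x i32> -> <8 x float> when the characteristic type is float.
  return Builder.CreateBitCast(Ext, VectorType::get(CharTy, VF), "mask.cast");
}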
     Function *VectorF;
     if (UseVectorIntrinsic) {
       // Use vector version of the intrinsic.
@@ -4934,8 +5046,13 @@
       TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
     } else {
-      // Use vector version of the library call.
-      StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+      // Use vector version of the library call or a simd function. Keep the
+      // simd variant name alive past the if so VFnName doesn't dangle.
+      StringRef VFnName;
+      std::string VariantName;
+      if (UseSimdFunction) {
+        VariantName = SimdVariant->encode() + FnName.str();
+        VFnName = VariantName;
+      } else
+        VFnName = TLI->getVectorizedFunction(FnName, VF);
       assert(!VFnName.empty() && "Vector function name is empty.");
       VectorF = M->getFunction(VFnName);
       if (!VectorF) {
@@ -4959,6 +5076,13 @@
     addMetadata(V, &I);
   }

+  if (UseSimdFunction) {
+    // Remove simd function attributes from the original function just in
+    // case VecClone runs again.
+    F->removeFnAttr("vector-variants");
+    delete SimdVariant;
+  }
+
   break;
 }
@@ -5357,7 +5481,8 @@
       if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
           !isa<DbgInfoIntrinsic>(CI) &&
           !(CI->getCalledFunction() && TLI &&
-            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
+            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName())) &&
+          !(CI->getCalledFunction() &&
+            CI->getCalledFunction()->hasFnAttribute("vector-variants"))) {
         ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
                   << "call instruction cannot be vectorized");
         DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
@@ -5885,6 +6010,12 @@
       if (C->canTrap())
         return false;
     }
+    // We will need a mask for masked simd functions.
+    auto *CI = dyn_cast<CallInst>(&I);
+    if (CI && CI->getCalledFunction() &&
+        CI->getCalledFunction()->hasFnAttribute("vector-variants")) {
+      MaskedOp.insert(CI);
+      continue;
+    }
     // We might be able to hoist the load.
     if (I.mayReadFromMemory()) {
       auto *LI = dyn_cast<LoadInst>(&I);
@@ -8150,6 +8281,17 @@
     unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
     bool UseVectorIntrinsic =
         ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+
+    // If a match for the simd function was not found, then just scalarize
+    // the calls to the original function.
+    Function *F = CI->getCalledFunction();
+    if (F && F->hasFnAttribute("vector-variants")) {
+      bool IsMasked = Legal->isMaskRequired(I);
+      VectorVariant *SimdVariant = matchVectorVariant(F, VF, IsMasked, TTI);
+      if (!SimdVariant)
+        return false;
+      delete SimdVariant;
+    }
+
     return UseVectorIntrinsic || !NeedToScalarize;
   }
   if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
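Reviewer note (not part of the patch): when M->getFunction(VFnName) misses, the widening path has to materialize a declaration whose signature uses the widened Tys computed earlier. A hypothetical sketch of that getFunction/Function::Create sequence, under the assumption that the caller already built RetTy and Tys as above (getOrInsertVectorDecl is an illustrative name, not a function in the patch):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

using namespace llvm;

Function *getOrInsertVectorDecl(Module &M, StringRef VFnName, Type *RetTy,
                                ArrayRef<Type *> Tys) {
  // Reuse an existing declaration or definition if the module has one.
  if (Function *Existing = M.getFunction(VFnName))
    return Existing;
  // e.g. '<8 x i32> @_ZGVdM8vlu_dowork(<8 x i32>, i32, i32, <8 x i32>)'.
  FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
  return Function::Create(FTy, Function::ExternalLinkage, VFnName, &M);
}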
Index: test/Transforms/LoopVectorize/masked_simd_func.ll
===================================================================
--- test/Transforms/LoopVectorize/masked_simd_func.ll
+++ test/Transforms/LoopVectorize/masked_simd_func.ll
@@ -0,0 +1,107 @@
+; Note: Test the simd function caller side functionality. The function side vectorization is tested under VecClone.
+
+; RUN: opt < %s -vec-clone -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s
+
+; CHECK: call <8 x i32> @_ZGVdM8vlu_dowork
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @dowork(i32 %b, i32 %k, i32 %c) #0 {
+entry:
+  %add = add nsw i32 %b, %k
+  %add1 = add nsw i32 %add, %c
+  ret i32 %add1
+}
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @main() local_unnamed_addr #1 {
+entry:
+  %a = alloca [4096 x i32], align 16
+  %b = alloca [4096 x i32], align 16
+  %0 = bitcast [4096 x i32]* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %0) #4
+  %1 = bitcast [4096 x i32]* %b to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %1) #4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv39 = phi i64 [ 0, %entry ], [ %indvars.iv.next40, %for.body ]
+  %arrayidx = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv39
+  %2 = trunc i64 %indvars.iv39 to i32
+  store i32 %2, i32* %arrayidx, align 4, !tbaa !2
+  %indvars.iv.next40 = add nuw nsw i64 %indvars.iv39, 1
+  %exitcond41 = icmp eq i64 %indvars.iv.next40, 4096
+  br i1 %exitcond41, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 3
+  %3 = load i32, i32* %arrayidx1, align 4, !tbaa !2
+  br label %omp.inner.for.body
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.inc, %for.end
+  %indvars.iv36 = phi i64 [ 0, %for.end ], [ %indvars.iv.next37, %omp.inner.for.inc ]
+  %4 = trunc i64 %indvars.iv36 to i32
+  %rem = and i32 %4, 1
+  %tobool = icmp eq i32 %rem, 0
+  br i1 %tobool, label %omp.inner.for.inc, label %if.then
+
+if.then:                                          ; preds = %omp.inner.for.body
+  %arrayidx5 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv36
+  %5 = load i32, i32* %arrayidx5, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %call = tail call i32 @dowork(i32 %5, i32 %4, i32 %3), !llvm.mem.parallel_loop_access !6
+  %arrayidx7 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv36
+  store i32 %call, i32* %arrayidx7, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.inner.for.body, %if.then
+  %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, 1
+  %exitcond38 = icmp eq i64 %indvars.iv.next37, 4096
+  br i1 %exitcond38, label %omp.inner.for.end, label %omp.inner.for.body, !llvm.loop !6
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.inc
+  br label %for.body11
+
+for.body11:                                       ; preds = %for.body11, %omp.inner.for.end
+  %indvars.iv = phi i64 [ 0, %omp.inner.for.end ], [ %indvars.iv.next, %for.body11 ]
+  %arrayidx13 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv
+  %6 = load i32, i32* %arrayidx13, align 4, !tbaa !2
+  %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %6)
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond, label %for.end17, label %for.body11
+
+for.end17:                                        ; preds = %for.body11
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %1) #4
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %0) #4
+  ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
+
+declare i32 @printf(i8*, ...) #3
+
+attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN4vlu_dowork,_ZGVcN8vlu_dowork,_ZGVdN8vlu_dowork,_ZGVeN16vlu_dowork,_ZGVbM4vlu_dowork,_ZGVcM8vlu_dowork,_ZGVdM8vlu_dowork,_ZGVeM16vlu_dowork" }
+attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 (trunk 316400)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: test/Transforms/LoopVectorize/simd_func.ll
===================================================================
--- test/Transforms/LoopVectorize/simd_func.ll
+++ test/Transforms/LoopVectorize/simd_func.ll
@@ -0,0 +1,99 @@
+; Note: Test the simd function caller side functionality. The function side vectorization is tested under VecClone.
+
+; RUN: opt < %s -vec-clone -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s
+
+; CHECK: call <8 x i32> @_ZGVdN8vlu_dowork
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @dowork(i32 %b, i32 %k, i32 %c) #0 {
+entry:
+  %add = add nsw i32 %b, %k
+  %add1 = add nsw i32 %add, %c
+  ret i32 %add1
+}
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @main() local_unnamed_addr #1 {
+entry:
+  %a = alloca [4096 x i32], align 16
+  %b = alloca [4096 x i32], align 16
+  %0 = bitcast [4096 x i32]* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %0) #4
+  %1 = bitcast [4096 x i32]* %b to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %1) #4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv38 = phi i64 [ 0, %entry ], [ %indvars.iv.next39, %for.body ]
+  %arrayidx = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv38
+  %2 = trunc i64 %indvars.iv38 to i32
+  store i32 %2, i32* %arrayidx, align 4, !tbaa !2
+  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+  %exitcond40 = icmp eq i64 %indvars.iv.next39, 4096
+  br i1 %exitcond40, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 3
+  %3 = load i32, i32* %arrayidx1, align 4, !tbaa !2
+  br label %omp.inner.for.body
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.body, %for.end
+  %indvars.iv35 = phi i64 [ 0, %for.end ], [ %indvars.iv.next36, %omp.inner.for.body ]
+  %arrayidx5 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv35
+  %4 = load i32, i32* %arrayidx5, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %5 = trunc i64 %indvars.iv35 to i32
+  %call = tail call i32 @dowork(i32 %4, i32 %5, i32 %3), !llvm.mem.parallel_loop_access !6
+  %arrayidx7 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv35
+  store i32 %call, i32* %arrayidx7, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+  %exitcond37 = icmp eq i64 %indvars.iv.next36, 4096
+  br i1 %exitcond37, label %omp.inner.for.end, label %omp.inner.for.body, !llvm.loop !6
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.body
+  br label %for.body11
+
+for.body11:                                       ; preds = %for.body11, %omp.inner.for.end
+  %indvars.iv = phi i64 [ 0, %omp.inner.for.end ], [ %indvars.iv.next, %for.body11 ]
+  %arrayidx13 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv
+  %6 = load i32, i32* %arrayidx13, align 4, !tbaa !2
+  %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %6)
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond, label %for.end17, label %for.body11
+
+for.end17:                                        ; preds = %for.body11
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %1) #4
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %0) #4
+  ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
+
+declare i32 @printf(i8*, ...) #3
+
+attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN4vlu_dowork,_ZGVcN8vlu_dowork,_ZGVdN8vlu_dowork,_ZGVeN16vlu_dowork,_ZGVbM4vlu_dowork,_ZGVcM8vlu_dowork,_ZGVdM8vlu_dowork,_ZGVeM16vlu_dowork" }
+attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 (trunk 316400)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: test/Transforms/LoopVectorize/simd_func_scalar.ll
===================================================================
--- test/Transforms/LoopVectorize/simd_func_scalar.ll
+++ test/Transforms/LoopVectorize/simd_func_scalar.ll
@@ -0,0 +1,111 @@
+; Note: Test the simd function caller side functionality. The function side vectorization is tested under VecClone.
+
+; RUN: opt < %s -vec-clone -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s
+
+; CHECK: extractelement <4 x i32>
+; CHECK: extractelement <4 x i32>
+; CHECK: call i32 @dowork
+; CHECK: extractelement <4 x i32>
+; CHECK: extractelement <4 x i32>
+; CHECK: call i32 @dowork
+; CHECK: extractelement <4 x i32>
+; CHECK: extractelement <4 x i32>
+; CHECK: call i32 @dowork
+; CHECK: extractelement <4 x i32>
+; CHECK: extractelement <4 x i32>
+; CHECK: call i32 @dowork
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @dowork(i32 %b, i32 %k, i32 %c) #0 {
+entry:
+  %add = add nsw i32 %b, %k
+  %add1 = add nsw i32 %add, %c
+  ret i32 %add1
+}
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @main() local_unnamed_addr #1 {
+entry:
+  %a = alloca [4096 x i32], align 16
+  %b = alloca [4096 x i32], align 16
+  %0 = bitcast [4096 x i32]* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %0) #4
+  %1 = bitcast [4096 x i32]* %b to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %1) #4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv38 = phi i64 [ 0, %entry ], [ %indvars.iv.next39, %for.body ]
+  %arrayidx = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv38
+  %2 = trunc i64 %indvars.iv38 to i32
+  store i32 %2, i32* %arrayidx, align 4, !tbaa !2
+  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+  %exitcond40 = icmp eq i64 %indvars.iv.next39, 4096
+  br i1 %exitcond40, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 3
+  %3 = load i32, i32* %arrayidx1, align 4, !tbaa !2
+  br label %omp.inner.for.body
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.body, %for.end
+  %indvars.iv35 = phi i64 [ 0, %for.end ], [ %indvars.iv.next36, %omp.inner.for.body ]
+  %arrayidx5 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv35
+  %4 = load i32, i32* %arrayidx5, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %5 = trunc i64 %indvars.iv35 to i32
+  %call = tail call i32 @dowork(i32 %4, i32 %5, i32 %3), !llvm.mem.parallel_loop_access !6
+  %arrayidx7 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv35
+  store i32 %call, i32* %arrayidx7, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+  %exitcond37 = icmp eq i64 %indvars.iv.next36, 4096
+  br i1 %exitcond37, label %omp.inner.for.end, label %omp.inner.for.body, !llvm.loop !6
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.body
+  br label %for.body11
+
+for.body11:                                       ; preds = %for.body11, %omp.inner.for.end
+  %indvars.iv = phi i64 [ 0, %omp.inner.for.end ], [ %indvars.iv.next, %for.body11 ]
+  %arrayidx13 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv
+  %6 = load i32, i32* %arrayidx13, align 4, !tbaa !2
+  %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %6)
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond, label %for.end17, label %for.body11
+
+for.end17:                                        ; preds = %for.body11
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %1) #4
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %0) #4
+  ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
+
+declare i32 @printf(i8*, ...) #3
+
+attributes #0 = { noinline norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN8vlu_dowork,_ZGVcN8vlu_dowork,_ZGVdN8vlu_dowork,_ZGVeN8vlu_dowork,_ZGVbM8vlu_dowork,_ZGVcM8vlu_dowork,_ZGVdM8vlu_dowork,_ZGVeM8vlu_dowork" }
+attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 (trunk 316400)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 4}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}