Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -740,6 +740,27 @@
   /// vector of instructions.
   void addMetadata(ArrayRef<Value *> To, Instruction *From);
 
+  /// Check legality of the given SVML call instruction \p VecCall generated
+  /// for scalar call \p Call. If illegal, the appropriate legal instruction
+  /// is returned.
+  Value *legalizeSVMLCall(CallInst *VecCall, CallInst *Call);
+
+  /// Returns the legal VF for a call instruction \p CI using TTI information
+  /// and the vector type.
+  unsigned getLegalVFForCall(CallInst *CI);
+
+  /// Partially vectorize a given call \p Call by breaking it down into
+  /// multiple calls of \p LegalCall, decided by the variant VF \p LegalVF.
+  Value *partialVectorizeCall(CallInst *Call, CallInst *LegalCall,
+                              unsigned LegalVF);
+
+  /// Generate a shufflevector instruction for a vector value \p V based on
+  /// the current \p Part and a smaller VF \p LegalVF.
+  Value *generateShuffleValue(Value *V, unsigned LegalVF, unsigned Part);
+
+  /// Combine partially vectorized calls stored in \p CallResults.
+  Value *combinePartialVecCalls(SmallVectorImpl<Value *> &CallResults);
+
   /// The original loop.
   Loop *OrigLoop;
@@ -4676,11 +4697,255 @@
     if (isa<FPMathOperator>(V))
       V->copyFastMathFlags(CI);
 
-    VectorLoopValueMap.setVectorValue(&I, Part, V);
-    addMetadata(V, &I);
+    // Perform legalization of the SVML call instruction only if the original
+    // call was not an intrinsic.
+    if (!isa<IntrinsicInst>(CI) &&
+        (V->getCalledFunction()->getName()).startswith("__svml")) {
+      LLVM_DEBUG(dbgs() << "LV(SVML): Vector call inst:"; V->dump());
+      auto *LegalV = cast<CallInst>(legalizeSVMLCall(V, CI));
+      LLVM_DEBUG(dbgs() << "LV: Completed SVML legalization.\n LegalV: ";
+                 LegalV->dump());
+      VectorLoopValueMap.setVectorValue(&I, Part, LegalV);
+      addMetadata(LegalV, &I);
+    } else {
+      VectorLoopValueMap.setVectorValue(&I, Part, V);
+      addMetadata(V, &I);
+    }
   }
 }
 
+//===----------------------------------------------------------------------===//
+// Implementation of functions for SVML vector call legalization.
+//===----------------------------------------------------------------------===//
+//
+// Unlike other VECLIBs, SVML needs to be used with target-legal
+// vector types. Otherwise, link failures and/or runtime failures
+// will occur. A motivating example:
+//
+//   double *a;
+//   float *b;
+//   #pragma clang loop vectorize_width(8)
+//   for (i = 0; i < N; ++i) {
+//     a[i] = sin(i);  // Legal SVML VF must be 4 or below on AVX
+//     b[i] = cosf(i); // VF can be 8 on AVX since 8 floats can fit in YMM
+//   }
+//
+// The current implementation of vector code generation in LV is
+// driven by a single VF (in InnerLoopVectorizer::VF). This
+// inhibits the flexibility of adjusting/choosing different VFs
+// for different instructions.
+//
+// Due to this limitation it is much more straightforward to
+// first generate the illegal sin8 (__svml_sin8 for the SVML vector
+// library) call and then legalize it than to try to avoid
+// generating illegal code in the first place.
+//
+// The solution used here is to check the legality of the call
+// instruction right after generating it in the vectorizer and, if it is
+// illegal, to split the call arguments and issue multiple calls that
+// match the legal VF. This is currently demonstrated for the SVML
+// vector library calls (non-intrinsic version only).
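+//
+// As an illustration (taken from the svml-legal-codegen.ll test added in
+// this patch; the value names below are only illustrative), an illegal
+// VF-8 call over doubles on AVX
+//
+//   %t = call <8 x double> @__svml_sin8(<8 x double> %x)
+//
+// is legalized into two VF-4 calls whose operands and results are split
+// and recombined with shufflevectors:
+//
+//   %x.lo = shufflevector <8 x double> %x, <8 x double> undef,
+//           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+//   %t.lo = call <4 x double> @__svml_sin4(<4 x double> %x.lo)
+//   %x.hi = shufflevector <8 x double> %x, <8 x double> undef,
+//           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+//   %t.hi = call <4 x double> @__svml_sin4(<4 x double> %x.hi)
+//   %t    = shufflevector <4 x double> %t.lo, <4 x double> %t.hi,
+//           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>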
+//
+// Future directions and extensions:
+// 1) This legalization example shows us that a good direction
+//    for the VPlan framework would be to model vector call
+//    instructions in a way that the legal VF for each call is chosen
+//    correctly within the vectorizer and illegal code generation is
+//    avoided.
+// 2) This logic can also be extended to general vector functions,
+//    i.e., legalization of OpenMP declare simd functions. The
+//    requirements needed for this will be documented soon.

+Value *InnerLoopVectorizer::legalizeSVMLCall(CallInst *VecCall,
+                                             CallInst *Call) {
+  unsigned LegalVF = getLegalVFForCall(VecCall);
+
+  assert(LegalVF > 1 &&
+         "Legal VF for SVML call must be greater than 1 to vectorize");
+
+  if (LegalVF == VF.getKnownMinValue())
+    return VecCall;
+  else if (LegalVF > VF.getKnownMinValue())
+    // TODO: handle case when we are underfilling vectors
+    return VecCall;
+
+  // The legal VF for this SVML call is smaller than the chosen VF; break it
+  // down into smaller call instructions.
+
+  // Convert args, types and return type to match the legal VF.
+  SmallVector<Type *, 4> NewTys;
+  SmallVector<Value *, 4> NewArgs;
+  Type *NewRetTy = ToVectorTy(Call->getType(), LegalVF);
+
+  for (Value *ArgOperand : Call->arg_operands()) {
+    Type *Ty = ToVectorTy(ArgOperand->getType(), LegalVF);
+    NewTys.push_back(Ty);
+    NewArgs.push_back(UndefValue::get(Ty));
+  }
+
+  // Construct the legal vector function.
+  Function *F = Call->getCalledFunction();
+  StringRef FnName = F->getName();
+  Module *M = Call->getModule();
+  StringRef LegalVFnName = TLI->getVectorizedFunction(FnName, LegalVF);
+  assert(!LegalVFnName.empty() && (LegalVFnName != FnName) &&
+         "Could not find legal vector function in TLI.");
+
+  Function *LegalVectorF = M->getFunction(LegalVFnName);
+  if (!LegalVectorF) {
+    FunctionType *LegalFTy = FunctionType::get(NewRetTy, NewTys, false);
+    LegalVectorF =
+        Function::Create(LegalFTy, Function::ExternalLinkage, LegalVFnName, M);
+    LegalVectorF->copyAttributesFrom(F);
+  }
+  assert(LegalVectorF &&
+         "Module does not have vector version for legal SVML call.");
+  LLVM_DEBUG(dbgs() << "LV(SVML): LegalVectorF: "; LegalVectorF->dump());
+
+  SmallVector<OperandBundleDef, 1> OpBundles;
+  Call->getOperandBundlesAsDefs(OpBundles);
+  CallInst *LegalV = CallInst::Create(LegalVectorF, NewArgs, OpBundles);
+
+  if (isa<FPMathOperator>(LegalV))
+    LegalV->copyFastMathFlags(Call);
+
+  LLVM_DEBUG(dbgs() << "LV(SVML): LegalV: "; LegalV->dump());
+
+  Value *LegalizedCall = partialVectorizeCall(VecCall, LegalV, LegalVF);
+
+  LLVM_DEBUG(dbgs() << "LV(SVML): LegalizedCall: "; LegalizedCall->dump());
+
+  // Remove the illegal call from the IR.
+  VecCall->eraseFromParent();
+
+  // LegalV only served as a template for the partial calls and was never
+  // inserted; free it.
+  if (LegalV)
+    delete LegalV;
+
+  return LegalizedCall;
+}
+
+unsigned InnerLoopVectorizer::getLegalVFForCall(CallInst *CI) {
+  const DataLayout &DL = CI->getModule()->getDataLayout();
+  FunctionType *CallFT = CI->getFunctionType();
+  // All functions that need legalization should have a vector return type.
+  // This is true for all SVML functions that are currently supported.
+  assert(isa<VectorType>(CallFT->getReturnType()) &&
+         "Return type of call that needs legalization is not a vector.");
+  auto *VecCallRetType = cast<VectorType>(CallFT->getReturnType());
+  Type *ElemType = VecCallRetType->getElementType();
+
+  unsigned TypeBitWidth = DL.getTypeSizeInBits(ElemType);
+  unsigned VectorBitWidth = TTI->getRegisterBitWidth(true);
+  unsigned LegalVF = VectorBitWidth / TypeBitWidth;
+
+  LLVM_DEBUG(dbgs() << "LV(SVML): Type Bit Width: " << TypeBitWidth << "\n");
+  LLVM_DEBUG(dbgs() << "LV(SVML): Current VL: " << VF << "\n");
+  LLVM_DEBUG(dbgs() << "LV(SVML): Vector Bit Width: " << VectorBitWidth
+                    << "\n");
+  LLVM_DEBUG(dbgs() << "LV(SVML): Legal Target VL: " << LegalVF << "\n");
+
+  return LegalVF;
+}
+
+// Partial vectorization of a call instruction is achieved by making clones of
+// \p LegalCall and overwriting its argument operands with shufflevector
+// equivalents, chosen based on \p LegalVF and the current Part being filled.
+Value *InnerLoopVectorizer::partialVectorizeCall(CallInst *Call,
+                                                 CallInst *LegalCall,
+                                                 unsigned LegalVF) {
+  unsigned NumParts = VF.getKnownMinValue() / LegalVF;
+  LLVM_DEBUG(dbgs() << "LV(SVML): NumParts: " << NumParts << "\n");
+  SmallVector<Value *, 8> CallResults;
+
+  for (unsigned Part = 0; Part < NumParts; ++Part) {
+    auto *ClonedCall = cast<CallInst>(LegalCall->clone());
+
+    // Update the arg operands of the cloned call to shufflevectors.
+    for (unsigned i = 0, ie = Call->getNumArgOperands(); i != ie; ++i) {
+      auto *NewOp =
+          generateShuffleValue(Call->getArgOperand(i), LegalVF, Part);
+      ClonedCall->setArgOperand(i, NewOp);
+    }
+
+    LLVM_DEBUG(dbgs() << "LV(SVML): ClonedCall: "; ClonedCall->dump());
+
+    auto *PartialVecCall = Builder.Insert(ClonedCall);
+    CallResults.push_back(PartialVecCall);
+  }
+
+  return combinePartialVecCalls(CallResults);
+}
+
+Value *InnerLoopVectorizer::generateShuffleValue(Value *V, unsigned LegalVF,
+                                                 unsigned Part) {
+  // Example:
+  // Consider the following vector code -
+  //   %1 = sitofp <4 x i32> %0 to <4 x double>
+  //   %2 = call <4 x double> @__svml_sin4(<4 x double> %1)
+  //
+  // If the LegalVF is 2, we partially vectorize the sin4 call by invoking
+  // generateShuffleValue on the operand %1.
+  // For Part = 0 the output value is -
+  //   %shuffle = shufflevector <4 x double> %1, <4 x double> undef,
+  //              <2 x i32> <i32 0, i32 1>
+  // and for Part = 1 the output is -
+  //   %shuffle7 = shufflevector <4 x double> %1, <4 x double> undef,
+  //               <2 x i32> <i32 2, i32 3>
+
+  assert(isa<VectorType>(V->getType()) &&
+         "Cannot generate shuffles for non-vector values.");
+  SmallVector<int, 4> ShuffleMask;
+  Value *Undef = UndefValue::get(V->getType());
+
+  unsigned ElemIdx = Part * LegalVF;
+
+  for (unsigned K = 0; K < LegalVF; K++)
+    ShuffleMask.push_back(ElemIdx + K);
+
+  auto *ShuffleInst =
+      Builder.CreateShuffleVector(V, Undef, ShuffleMask, "shuffle");
+
+  return ShuffleInst;
+}
+
+// Results of the calls executed by the smaller legal call instructions must
+// be combined to match the original VF for later use. This is done by
+// constructing shufflevector instructions in a cumulative fashion.
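+//
+// For example (an illustrative sketch of the algorithm below, not tied to a
+// particular test), with VF = 8 and LegalVF = 2 there are four partial
+// <2 x double> results R0..R3. The first sweep emits
+//   %c01 = shufflevector R0, R1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+//   %c23 = shufflevector R2, R3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// and the second sweep combines those into the final
+//   %c = shufflevector %c01, %c23,
+//        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>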
+Value *InnerLoopVectorizer::combinePartialVecCalls(
+    SmallVectorImpl<Value *> &CallResults) {
+  assert(isa<VectorType>(CallResults[0]->getType()) &&
+         "Cannot combine calls with non-vector results.");
+  auto *CallType = cast<VectorType>(CallResults[0]->getType());
+
+  Value *CombinedShuffle = nullptr;
+  unsigned NumElems = CallType->getNumElements() * 2;
+  unsigned NumRegs = CallResults.size();
+
+  assert(NumRegs >= 2 && isPowerOf2_32(NumRegs) &&
+         "Number of partial vector calls to combine must be a power of 2 "
+         "(at least 2^1)");
+
+  while (NumRegs > 1) {
+    for (unsigned I = 0; I < NumRegs; I += 2) {
+      SmallVector<int, 8> ShuffleMask;
+      for (unsigned J = 0; J < NumElems; J++)
+        ShuffleMask.push_back(J);
+
+      CombinedShuffle = Builder.CreateShuffleVector(
+          CallResults[I], CallResults[I + 1], ShuffleMask, "combined");
+      LLVM_DEBUG(dbgs() << "LV(SVML): CombinedShuffle:";
+                 CombinedShuffle->dump());
+      CallResults.push_back(CombinedShuffle);
+    }
+
+    // Drop the values that were just combined; the newly created wider
+    // shuffles remain at the end of CallResults for the next round.
+    SmallVectorImpl<Value *>::iterator Start = CallResults.begin();
+    SmallVectorImpl<Value *>::iterator End = Start + NumRegs;
+    CallResults.erase(Start, End);
+
+    NumElems *= 2;
+    NumRegs /= 2;
+  }
+
+  return CombinedShuffle;
+}
+
 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
                                                  VPUser &Operands,
                                                  bool InvariantCond,
Index: llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
+++ llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
@@ -39,7 +39,8 @@
 declare double @__exp_finite(double) #0
 
 ; CHECK-LABEL: @exp_f64
-; CHECK: <4 x double> @__svml_exp4
+; CHECK: <2 x double> @__svml_exp2
+; CHECK: <2 x double> @__svml_exp2
 ; CHECK: ret
 define void @exp_f64(double* nocapture %varray) {
 entry:
@@ -99,7 +100,8 @@
 declare double @__log_finite(double) #0
 
 ; CHECK-LABEL: @log_f64
-; CHECK: <4 x double> @__svml_log4
+; CHECK: <2 x double> @__svml_log2
+; CHECK: <2 x double> @__svml_log2
 ; CHECK: ret
 define void @log_f64(double* nocapture %varray) {
 entry:
@@ -159,7 +161,8 @@
 declare double @__pow_finite(double, double) #0
 
 ; CHECK-LABEL: @pow_f64
-; CHECK: <4 x double> @__svml_pow4
+; CHECK: <2 x double> @__svml_pow2
+; CHECK: <2 x double> @__svml_pow2
 ; CHECK: ret
 define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
 entry:
@@ -219,7 +222,8 @@
 
 define void @exp2_finite(double* nocapture %varray) {
 ; CHECK-LABEL: @exp2_finite(
-; CHECK: call <4 x double> @__svml_exp24(<4 x double> {{.*}})
+; CHECK: call <2 x double> @__svml_exp22(<2 x double> {{.*}})
+; CHECK: call <2 x double> @__svml_exp22(<2 x double> {{.*}})
 ; CHECK: ret void
 ;
 entry:
Index: llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
@@ -0,0 +1,508 @@
+; Check legalization of SVML calls. Also checks that intrinsic calls are not legalized by the vectorizer.
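+;
+; With -force-vector-width=8 and -mattr=avx, the double-precision SVML calls
+; below are expected to be split into two VF-4 calls (only 4 doubles fit in a
+; 256-bit YMM register), while the single-precision calls stay at VF 8
+; (8 floats fit in a YMM register).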
+ +; RUN: opt -vector-library=SVML -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare double @sin(double) #0 +declare float @sinf(float) #0 +declare double @llvm.sin.f64(double) #0 +declare float @llvm.sin.f32(float) #0 + +declare double @cos(double) #0 +declare float @cosf(float) #0 +declare double @llvm.cos.f64(double) #0 +declare float @llvm.cos.f32(float) #0 + +declare double @pow(double, double) #0 +declare float @powf(float, float) #0 +declare double @llvm.pow.f64(double, double) #0 +declare float @llvm.pow.f32(float, float) #0 + +declare double @exp(double) #0 +declare float @expf(float) #0 +declare double @llvm.exp.f64(double) #0 +declare float @llvm.exp.f32(float) #0 + +declare double @log(double) #0 +declare float @logf(float) #0 +declare double @llvm.log.f64(double) #0 +declare float @llvm.log.f32(float) #0 + + +define void @sin_f64(double* nocapture %varray) { +; CHECK-LABEL: @sin_f64( +; CHECK: [[TMP1:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @sin(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f32(float* nocapture %varray) { +; CHECK-LABEL: @sin_f32( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_sinf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @sinf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @sin_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x double> @__svml_sin8(<8 x double> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.sin.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @sin_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_sinf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.sin.f32(float %conv) + %arrayidx = getelementptr inbounds 
float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f64(double* nocapture %varray) { +; CHECK-LABEL: @cos_f64( +; CHECK: [[TMP1:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @cos(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f32(float* nocapture %varray) { +; CHECK-LABEL: @cos_f32( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_cosf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @cosf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @cos_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x double> @__svml_cos8(<8 x double> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.cos.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @cos_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_cosf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.cos.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f64( +; CHECK: [[TMP1:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]]) +; CHECK: [[TMP4:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = 
getelementptr inbounds double, double* %exp, i64 %iv + %tmp1 = load double, double* %arrayidx, align 4 + %tmp2 = tail call double @pow(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv + store double %tmp2, double* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x double> @__svml_pow8(<8 x double> [[TMP2:%.*]], <8 x double> [[TMP3:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv + %tmp1 = load double, double* %arrayidx, align 4 + %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv + store double %tmp2, double* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f32( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_powf8(<8 x float> [[TMP2:%.*]], <8 x float> [[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv + %tmp1 = load float, float* %arrayidx, align 4 + %tmp2 = tail call float @powf(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv + store float %tmp2, float* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_powf8(<8 x float> [[TMP2:%.*]], <8 x float> [[TMP3:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv + %tmp1 = load float, float* %arrayidx, align 4 + %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv + store float %tmp2, float* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f64(double* nocapture %varray) { +; CHECK-LABEL: @exp_f64( +; CHECK: [[TMP1:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @exp(double %conv) + %arrayidx = getelementptr 
inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f32(float* nocapture %varray) { +; CHECK-LABEL: @exp_f32( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_expf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @expf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @exp_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x double> @__svml_exp8(<8 x double> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.exp.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @exp_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_expf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.exp.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f64(double* nocapture %varray) { +; CHECK-LABEL: @log_f64( +; CHECK: [[TMP1:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @log(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f32(float* nocapture %varray) { +; CHECK-LABEL: @log_f32( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_logf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @logf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add 
nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @log_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x double> @__svml_log8(<8 x double> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.log.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @log_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_logf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.log.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind readnone } + Index: llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll @@ -0,0 +1,61 @@ +; Check that vector codegen splits illegal sin8 call to two sin4 calls on AVX for double datatype. 
+; The C code used to generate this test:
+
+; #include <math.h>
+;
+; void foo(double *a, int N){
+;   int i;
+; #pragma clang loop vectorize_width(8)
+;   for (i=0;i<N;i++){
+;     a[i] = sin(i);
+;   }
+; }
+
+; RUN: opt -vector-library=SVML -loop-vectorize -mattr=avx -S < %s | FileCheck %s
+
+; CHECK: [[I1:%.*]] = sitofp <8 x i32> [[I0:%.*]] to <8 x double>
+; CHECK-NEXT: [[S1:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[I2:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[S1]])
+; CHECK-NEXT: [[S2:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[I3:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[S2]])
+; CHECK-NEXT: [[comb:%combined.*]] = shufflevector <4 x double> [[I2]], <4 x double> [[I3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: store <8 x double> [[comb]], <8 x double>* [[TMP:%.*]], align 8
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define dso_local void @foo(double* nocapture %a, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp5 = icmp sgt i32 %N, 0
+  br i1 %cmp5, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %0 = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %0 to double
+  %call = tail call double @sin(double %conv) #2
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %call, double* %arrayidx, align 8, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Function Attrs: nounwind
+declare dso_local double @sin(double) local_unnamed_addr #1
+
+!2 = !{!3, !3, i64 0}
+!3 = !{!"double", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}