diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -161,21 +161,35 @@
       AK_MatchType = 7
     };
 
+    /// Returns true if the type of this argument or return value depends on
+    /// the type of another argument.
+    bool hasBoundType() const {
+      switch (Kind) {
+      case Argument:
+      case ExtendArgument:
+      case TruncArgument:
+      case HalfVecArgument:
+      case SameVecWidthArgument:
+      case PtrToArgument:
+      case PtrToElt:
+      case VecOfAnyPtrsToElt:
+      case VecElementArgument:
+      case Subdivide2Argument:
+      case Subdivide4Argument:
+      case VecOfBitcastsToInt:
+        return true;
+      default:
+        return false;
+      }
+    }
+
     unsigned getArgumentNumber() const {
-      assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument ||
-             Kind == SameVecWidthArgument || Kind == PtrToArgument ||
-             Kind == PtrToElt || Kind == VecElementArgument ||
-             Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
-             Kind == VecOfBitcastsToInt);
+      assert(hasBoundType() && Kind != VecOfAnyPtrsToElt);
       return Argument_Info >> 3;
     }
 
     ArgKind getArgumentKind() const {
-      assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument ||
-             Kind == SameVecWidthArgument || Kind == PtrToArgument ||
-             Kind == VecElementArgument || Kind == Subdivide2Argument ||
-             Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
+      assert(hasBoundType() && Kind != VecOfAnyPtrsToElt);
       return (ArgKind)(Argument_Info & 7);
     }
 
@@ -213,6 +227,13 @@
   /// of IITDescriptors.
   void getIntrinsicInfoTableEntries(ID id, SmallVectorImpl<IITDescriptor> &T);
 
+  /// Given the IIT table descriptors for an intrinsic, skips those that
+  /// correspond to one type element (either the return value or one
+  /// argument).
+  ///
+  /// This makes it possible to iterate over an intrinsic's description
+  /// without building the types it encodes.
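+  /// For example, if the first descriptor in \p Infos is Vector, the call
+  /// consumes both the Vector descriptor and the nested descriptor of its
+  /// element type, so a single call always skips exactly one value.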
+  void popFirstElement(ArrayRef<IITDescriptor> &Infos);
+
   enum MatchIntrinsicTypesResult {
     MatchIntrinsicTypes_Match = 0,
     MatchIntrinsicTypes_NoMatchRet = 1,
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -89,6 +89,7 @@
   case Intrinsic::fmuladd:
   case Intrinsic::powi:
   case Intrinsic::canonicalize:
+  case Intrinsic::isnan:
    return true;
  default:
    return false;
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1222,6 +1222,49 @@
     DecodeIITType(NextElt, IITEntries, IIT_Done, T);
 }
 
+void Intrinsic::popFirstElement(ArrayRef<IITDescriptor> &Infos) {
+  assert(!Infos.empty());
+
+  IITDescriptor D = Infos.front();
+  Infos = Infos.slice(1);
+  switch (D.Kind) {
+  case IITDescriptor::Void:
+  case IITDescriptor::VarArg:
+  case IITDescriptor::MMX:
+  case IITDescriptor::AMX:
+  case IITDescriptor::Token:
+  case IITDescriptor::Metadata:
+  case IITDescriptor::Half:
+  case IITDescriptor::BFloat:
+  case IITDescriptor::Float:
+  case IITDescriptor::Double:
+  case IITDescriptor::Quad:
+  case IITDescriptor::Integer:
+  case IITDescriptor::Argument:
+  case IITDescriptor::ExtendArgument:
+  case IITDescriptor::TruncArgument:
+  case IITDescriptor::Subdivide2Argument:
+  case IITDescriptor::Subdivide4Argument:
+  case IITDescriptor::HalfVecArgument:
+  case IITDescriptor::PtrToArgument:
+  case IITDescriptor::PtrToElt:
+  case IITDescriptor::VecElementArgument:
+  case IITDescriptor::VecOfBitcastsToInt:
+  case IITDescriptor::VecOfAnyPtrsToElt:
+    return;
+  case IITDescriptor::Vector:
+  case IITDescriptor::Pointer:
+  case IITDescriptor::SameVecWidthArgument:
+    popFirstElement(Infos);
+    return;
+  case IITDescriptor::Struct:
+    for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i)
+      popFirstElement(Infos);
+    return;
+  }
+  llvm_unreachable("unhandled");
+}
+
 static Type *DecodeFixedType(ArrayRef<IITDescriptor> &Infos,
                              ArrayRef<Type *> Tys, LLVMContext &Context) {
   using namespace Intrinsic;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5532,21 +5532,52 @@
       bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                           VecCallCosts.first <= VecCallCosts.second;
 
+      SmallVector<Intrinsic::IITDescriptor> Table;
+      ArrayRef<Intrinsic::IITDescriptor> TableRef;
+      if (UseIntrinsic) {
+        getIntrinsicInfoTableEntries(ID, Table);
+        TableRef = Table;
+      }
+      SmallVector<Type *> TysForDecl;
+
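+      // Walks the IIT descriptor table in step with the scalar call: each
+      // invocation consumes the descriptors for one value (first the return
+      // value, then each argument in turn) and records the types that an
+      // overloaded declaration of the vector intrinsic will need.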
+      const auto UpdateTysForDecl = [&](Value *V, bool IsScalar) {
+        if (!UseIntrinsic)
+          return;
+        const Intrinsic::IITDescriptor &Item = TableRef.front();
+        switch (Item.Kind) {
+        case Intrinsic::IITDescriptor::Argument:
+          if (Item.getArgumentNumber() >= TysForDecl.size()) {
+            if (IsScalar)
+              TysForDecl.push_back(V->getType());
+            else
+              TysForDecl.push_back(
+                  FixedVectorType::get(V->getType(), E->Scalars.size()));
+          }
+          break;
+        case Intrinsic::IITDescriptor::SameVecWidthArgument:
+          break;
+        default:
+          assert(!Item.hasBoundType());
+        }
+        popFirstElement(TableRef);
+      };
+
+      UpdateTysForDecl(CI, false);
       Value *ScalarArg = nullptr;
       std::vector<Value *> OpVecs;
-      SmallVector<Type *, 2> TysForDecl =
-          {FixedVectorType::get(CI->getType(), E->Scalars.size())};
       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
-        ValueList OpVL;
-        // Some intrinsics have scalar arguments. This argument should not be
-        // vectorized.
-        if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
-          CallInst *CEI = cast<CallInst>(VL0);
-          ScalarArg = CEI->getArgOperand(j);
-          OpVecs.push_back(CEI->getArgOperand(j));
-          if (hasVectorInstrinsicOverloadedScalarOpd(IID, j))
-            TysForDecl.push_back(ScalarArg->getType());
-          continue;
+        if (UseIntrinsic) {
+          // Some intrinsics have scalar arguments. Such arguments should not
+          // be vectorized.
+          bool IsScalar = hasVectorInstrinsicScalarOpd(IID, j);
+          if (IsScalar) {
+            CallInst *CEI = cast<CallInst>(VL0);
+            ScalarArg = CEI->getArgOperand(j);
+            OpVecs.push_back(CEI->getArgOperand(j));
+          }
+          UpdateTysForDecl(CI->getArgOperand(j), IsScalar);
+          if (IsScalar)
+            continue;
         }
 
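+        // Non-scalar operands are vectorized below; UpdateTysForDecl has
+        // already recorded their vector type where the declaration needs it.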
         Value *OpVec = vectorizeTree(E->getOperand(j));
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
@@ -195,25 +195,25 @@
 ; CHECK-NEXT:    [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[I0]], [[I1]]
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD1]], i1 true) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    [[CALL1:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD1]], i1 true) #[[ATTR5:[0-9]+]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1
 ; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 1
 ; CHECK-NEXT:    [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
 ; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[I2]], [[I3]]
-; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD2]], i1 false) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD2]], i1 false) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2
 ; CHECK-NEXT:    [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2
 ; CHECK-NEXT:    [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[I4]], [[I5]]
-; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD3]], i1 true) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD3]], i1 true) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3
 ; CHECK-NEXT:    [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3
 ; CHECK-NEXT:    [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
 ; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[I6]], [[I7]]
-; CHECK-NEXT:    [[CALL4:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD4]], i1 false) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL4:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD4]], i1 false) #[[ATTR5]]
 ; CHECK-NEXT:    store i32 [[CALL1]], i32* [[C:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 1
 ; CHECK-NEXT:    store i32 [[CALL2]], i32* [[ARRAYIDX8]], align 4
@@ -322,25 +322,25 @@
 ; CHECK-NEXT:    [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[I0]], [[I1]]
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD1]], i1 true) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD1]], i1 true) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1
 ; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 1
 ; CHECK-NEXT:    [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
 ; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[I2]], [[I3]]
-; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD2]], i1 false) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD2]], i1 false) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2
 ; CHECK-NEXT:    [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2
 ; CHECK-NEXT:    [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[I4]], [[I5]]
-; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD3]], i1 true) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD3]], i1 true) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3
 ; CHECK-NEXT:    [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3
 ; CHECK-NEXT:    [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
 ; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[I6]], [[I7]]
-; CHECK-NEXT:    [[CALL4:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD4]], i1 false) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL4:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD4]], i1 false) #[[ATTR5]]
 ; CHECK-NEXT:    store i32 [[CALL1]], i32* [[C:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 1
 ; CHECK-NEXT:    store i32 [[CALL2]], i32* [[ARRAYIDX8]], align 4
@@ -448,25 +448,25 @@
 ; CHECK-NEXT:    [[I0:%.*]] = load float, float* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[I1:%.*]] = load float, float* [[B:%.*]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = fadd float [[I0]], [[I1]]
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD1]], i32 [[P:%.*]]) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL1:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD1]], i32 [[P:%.*]]) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i32 1
 ; CHECK-NEXT:    [[I2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i32 1
 ; CHECK-NEXT:    [[I3:%.*]] = load float, float* [[ARRAYIDX3]], align 4
 ; CHECK-NEXT:    [[ADD2:%.*]] = fadd float [[I2]], [[I3]]
-; CHECK-NEXT:    [[CALL2:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD2]], i32 [[Q:%.*]]) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL2:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD2]], i32 [[Q:%.*]]) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i32 2
 ; CHECK-NEXT:    [[I4:%.*]] = load float, float* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[B]], i32 2
 ; CHECK-NEXT:    [[I5:%.*]] = load float, float* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD3:%.*]] = fadd float [[I4]], [[I5]]
-; CHECK-NEXT:    [[CALL3:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD3]], i32 [[P]]) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL3:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD3]], i32 [[P]]) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i32 3
 ; CHECK-NEXT:    [[I6:%.*]] = load float, float* [[ARRAYIDX6]], align 4
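 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i32 3
 ; CHECK-NEXT:    [[I7:%.*]] = load float, float* [[ARRAYIDX7]], align 4
 ; CHECK-NEXT:    [[ADD4:%.*]] = fadd float [[I6]], [[I7]]
-; CHECK-NEXT:    [[CALL4:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD4]], i32 [[Q]]) #[[ATTR4]]
+; CHECK-NEXT:    [[CALL4:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD4]], i32 [[Q]]) #[[ATTR5]]
 ; CHECK-NEXT:    store float [[CALL1]], float* [[C:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C]], i32 1
 ; CHECK-NEXT:    store float [[CALL2]], float* [[ARRAYIDX8]], align 4
@@ -520,20 +520,16 @@
 define void @vec_isnan_f64(double* %a, double* %b, double* %c, double* %d) {
+; With llvm.isnan treated as trivially vectorizable, the two scalar isnan
+; calls and their operand loads fold into single <2 x double> operations.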
 ; CHECK-LABEL: @vec_isnan_f64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[AIDX1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[A0:%.*]] = load double, double* [[A]], align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[AIDX1]], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
-; CHECK-NEXT:    [[ISNAN0:%.*]] = tail call i1 @llvm.isnan.f64(double [[A0]])
-; CHECK-NEXT:    [[ISNAN1:%.*]] = tail call i1 @llvm.isnan.f64(double [[A1]])
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i1> poison, i1 [[ISNAN0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i1> [[TMP4]], i1 [[ISNAN1]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x double> [[TMP1]], <2 x double> [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[D:%.*]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x i1> @llvm.isnan.v2f64(<2 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[TMP6]], <2 x double> [[TMP3]], <2 x double> [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[D:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry: