diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -320,6 +320,10 @@
 /// an overloaded type.
 bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, unsigned OpdIdx);
 
+/// Identifies if the vector form of the intrinsic uses the return type as the
+/// first overloaded type.
+bool isVectorIntrinsicWithReturnOverloadType(Intrinsic::ID ID);
+
 /// Returns intrinsic ID for call.
 /// For the input call instruction it finds mapping intrinsic and returns
 /// its intrinsic ID, in case it does not found it return not_intrinsic.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -91,6 +91,7 @@
   case Intrinsic::canonicalize:
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::is_fpclass:
     return true;
   default:
     return false;
@@ -105,6 +106,7 @@
   case Intrinsic::ctlz:
   case Intrinsic::cttz:
   case Intrinsic::powi:
+  case Intrinsic::is_fpclass:
     return (ScalarOpdIdx == 1);
   case Intrinsic::smul_fix:
   case Intrinsic::smul_fix_sat:
@@ -121,6 +123,7 @@
   switch (ID) {
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::is_fpclass:
     return OpdIdx == 0;
   case Intrinsic::powi:
     return OpdIdx == 1;
@@ -129,6 +132,17 @@
   }
 }
 
+bool llvm::isVectorIntrinsicWithReturnOverloadType(Intrinsic::ID ID) {
+  switch (ID) {
+  case Intrinsic::is_fpclass: // The return element type is i1, and the
+                              // vector width of the return type matches
+                              // that of the first operand.
+    return false;
+  default:
+    return true;
+  }
+}
+
 /// Returns intrinsic ID for call.
 /// For the input call instruction it finds mapping intrinsic and returns
 /// its ID, in case it does not found it return not_intrinsic.
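
Taken together, the VectorUtils predicates now describe llvm.is.fpclass completely: it is trivially vectorizable, its i32 class-mask operand (operand 1) must stay scalar, and only operand 0 participates in the overload-type mangling. The return type does not participate because it is always i1, or a vector of i1 whose width matches operand 0, so the vector form is named @llvm.is.fpclass.v4f32 rather than @llvm.is.fpclass.v4i1.v4f32 (contrast @llvm.fptosi.sat.v2i32.v2f32, where the overloaded return type is mangled first). Below is a minimal sketch of how a transform can combine these predicates when assembling the overload-type list; buildOverloadTypes, VecRetTy, and VecArgTys are hypothetical names, not part of the patch:

    // Hypothetical helper, not in the patch: collect the overload types for
    // a widened intrinsic call in mangling order.
    static SmallVector<Type *, 2>
    buildOverloadTypes(Intrinsic::ID ID, Type *VecRetTy,
                       ArrayRef<Type *> VecArgTys) {
      SmallVector<Type *, 2> Tys;
      // Most intrinsics mangle the overloaded return type first; for
      // llvm.is.fpclass the new predicate returns false, so the return type
      // is omitted from the list.
      if (isVectorIntrinsicWithReturnOverloadType(ID))
        Tys.push_back(VecRetTy);
      // Append each operand type that participates in the mangling. For
      // llvm.is.fpclass this is operand 0 only; the i32 mask operand is
      // neither mangled nor widened.
      for (unsigned OpdIdx = 0, E = VecArgTys.size(); OpdIdx != E; ++OpdIdx)
        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx))
          Tys.push_back(VecArgTys[OpdIdx]);
      return Tys;
    }
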
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -579,7 +579,8 @@
   Scattered.resize(NumArgs);
 
   SmallVector<Type *, 3> Tys;
-  Tys.push_back(VT->getScalarType());
+  if (isVectorIntrinsicWithReturnOverloadType(ID))
+    Tys.push_back(VT->getScalarType());
 
   // Assumes that any vector type has the same number of elements as the return
   // vector type, which is true for all current intrinsics.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4179,8 +4179,11 @@
   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
          "Either the intrinsic cost or vector call cost must be valid");
 
+  bool UseReturnOverloadType = isVectorIntrinsicWithReturnOverloadType(ID);
   for (unsigned Part = 0; Part < UF; ++Part) {
-    SmallVector<Type *, 2> TysForDecl = {CI.getType()};
+    SmallVector<Type *, 2> TysForDecl;
+    if (UseReturnOverloadType)
+      TysForDecl.push_back(CI.getType());
     SmallVector<Value *, 4> Args;
     for (const auto &I : enumerate(ArgOperands.operands())) {
       // Some intrinsics have a scalar argument - don't replace it with a
@@ -4199,7 +4202,7 @@
     Function *VectorF;
     if (UseVectorIntrinsic) {
       // Use vector version of the intrinsic.
-      if (VF.isVector())
+      if (UseReturnOverloadType && VF.isVector())
         TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
       Module *M = State.Builder.GetInsertBlock()->getModule();
       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8291,8 +8291,10 @@
 
       Value *ScalarArg = nullptr;
       std::vector<Value *> OpVecs;
-      SmallVector<Type *, 2> TysForDecl =
-          {FixedVectorType::get(CI->getType(), E->Scalars.size())};
+      SmallVector<Type *, 2> TysForDecl;
+      if (isVectorIntrinsicWithReturnOverloadType(IID))
+        TysForDecl.push_back(
+            FixedVectorType::get(CI->getType(), E->Scalars.size()));
       for (int j = 0, e = CI->arg_size(); j < e; ++j) {
         ValueList OpVL;
         // Some intrinsics have scalar arguments. This argument should not be
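
All three transforms previously seeded the overload-type list unconditionally with the (widened or scalarized) return type; each now guards that first push with the new predicate and leaves the rest of its argument handling unchanged. Concretely, widening %c = call i1 @llvm.is.fpclass.f32(float %x, i32 1) at VF = 4 requests the overload list {<4 x float>}, supplied by the argument loop, instead of {<4 x i1>, <4 x float>}. A sketch of the resulting declaration lookup, assuming the usual M (Module) and Builder (IRBuilder) are in scope:

    // Overload list for llvm.is.fpclass at VF = 4: the operand type only.
    SmallVector<Type *, 2> TysForDecl;
    TysForDecl.push_back(FixedVectorType::get(Builder.getFloatTy(), 4));
    // Yields: declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)
    Function *VectorF =
        Intrinsic::getDeclaration(M, Intrinsic::is_fpclass, TysForDecl);
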
diff --git a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+define void @is_fpclass(ptr %x, ptr %y, i32 %n) {
+; CHECK-LABEL: @is_fpclass(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X2:%.*]] = ptrtoint ptr [[X:%.*]] to i64
+; CHECK-NEXT:    [[Y1:%.*]] = ptrtoint ptr [[Y:%.*]] to i64
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[Y1]], [[X2]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[WIDE_LOAD]], i32 1)
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[TMP9]], i32 1)
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[TMP11]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+  ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %1 = tail call i1 @llvm.is.fpclass.f32(float %0, i32 1)
+  %2 = zext i1 %1 to i32
+  %arrayidx2 = getelementptr inbounds i32, ptr %y, i64 %indvars.iv
+  store i32 %2, ptr %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare i1 @llvm.is.fpclass.f32(float, i32)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
@@ -195,25 +195,25 @@
 ; CHECK-NEXT:    [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[I0]], [[I1]]
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD1]], i1 true) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    [[CALL1:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD1]], i1 true) #[[ATTR5:[0-9]+]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1
 ; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 1
 ; CHECK-NEXT:    [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
 ; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[I2]], [[I3]]
-; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD2]], i1 false) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD2]], i1 false) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2
 ; CHECK-NEXT:    [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2
 ; CHECK-NEXT:    [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[I4]], [[I5]]
-; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD3]], i1 true) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD3]], i1 true) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3
 ; CHECK-NEXT:    [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3
 ; CHECK-NEXT:    [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
 ; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[I6]], [[I7]]
-; CHECK-NEXT:    [[CALL4:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD4]], i1 false) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL4:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD4]], i1 false) #[[ATTR5]]
 ; CHECK-NEXT:    store i32 [[CALL1]], i32* [[C:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 1
 ; CHECK-NEXT:    store i32 [[CALL2]], i32* [[ARRAYIDX8]], align 4
@@ -322,25 +322,25 @@
 ; CHECK-NEXT:    [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[I0]], [[I1]]
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD1]], i1 true) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD1]], i1 true) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1
 ; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 1
 ; CHECK-NEXT:    [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
 ; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[I2]], [[I3]]
-; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD2]], i1 false) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD2]], i1 false) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2
 ; CHECK-NEXT:    [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2
 ; CHECK-NEXT:    [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[I4]], [[I5]]
-; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD3]], i1 true) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD3]], i1 true) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3
 ; CHECK-NEXT:    [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3
 ; CHECK-NEXT:    [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
 ; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[I6]], [[I7]]
-; CHECK-NEXT:    [[CALL4:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD4]], i1 false) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL4:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD4]], i1 false) #[[ATTR5]]
 ; CHECK-NEXT:    store i32 [[CALL1]], i32* [[C:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 1
 ; CHECK-NEXT:    store i32 [[CALL2]], i32* [[ARRAYIDX8]], align 4
@@ -448,25 +448,25 @@
 ; CHECK-NEXT:    [[I0:%.*]] = load float, float* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[I1:%.*]] = load float, float* [[B:%.*]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = fadd float [[I0]], [[I1]]
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD1]], i32 [[P:%.*]]) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL1:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD1]], i32 [[P:%.*]]) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i32 1
 ; CHECK-NEXT:    [[I2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i32 1
 ; CHECK-NEXT:    [[I3:%.*]] = load float, float* [[ARRAYIDX3]], align 4
 ; CHECK-NEXT:    [[ADD2:%.*]] = fadd float [[I2]], [[I3]]
-; CHECK-NEXT:    [[CALL2:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD2]], i32 [[Q:%.*]]) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL2:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD2]], i32 [[Q:%.*]]) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i32 2
 ; CHECK-NEXT:    [[I4:%.*]] = load float, float* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[B]], i32 2
 ; CHECK-NEXT:    [[I5:%.*]] = load float, float* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD3:%.*]] = fadd float [[I4]], [[I5]]
-; CHECK-NEXT:    [[CALL3:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD3]], i32 [[P]]) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL3:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD3]], i32 [[P]]) #[[ATTR5]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i32 3
 ; CHECK-NEXT:    [[I6:%.*]] = load float, float* [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i32 3
 ; CHECK-NEXT:    [[I7:%.*]] = load float, float* [[ARRAYIDX7]], align 4
 ; CHECK-NEXT:    [[ADD4:%.*]] = fadd float [[I6]], [[I7]]
-; CHECK-NEXT:    [[CALL4:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD4]], i32 [[Q]]) #[[ATTR3]]
+; CHECK-NEXT:    [[CALL4:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD4]], i32 [[Q]]) #[[ATTR5]]
 ; CHECK-NEXT:    store float [[CALL1]], float* [[C:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C]], i32 1
 ; CHECK-NEXT:    store float [[CALL2]], float* [[ARRAYIDX8]], align 4
@@ -513,3 +513,61 @@
 
   ret void
 }
+
+
+declare i1 @llvm.is.fpclass.f32(float, i32)
+
+define void @vec_is_fpclass(float* %a, float* %b, i32* %c) {
+; CHECK-LABEL: @vec_is_fpclass(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP4]], i32 1)
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i1> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[C:%.*]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %i0 = load float, float* %a, align 4
+  %i1 = load float, float* %b, align 4
+  %add1 = fadd float %i0, %i1
+  %call1 = tail call i1 @llvm.is.fpclass.f32(float %add1, i32 1)
+  %call1.ext = zext i1 %call1 to i32
+
+  %arrayidx2 = getelementptr inbounds float, float* %a, i32 1
+  %i2 = load float, float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float, float* %b, i32 1
+  %i3 = load float, float* %arrayidx3, align 4
+  %add2 = fadd float %i2, %i3
+  %call2 = tail call i1 @llvm.is.fpclass.f32(float %add2, i32 1)
+  %call2.ext = zext i1 %call2 to i32
+
+  %arrayidx4 = getelementptr inbounds float, float* %a, i32 2
+  %i4 = load float, float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds float, float* %b, i32 2
+  %i5 = load float, float* %arrayidx5, align 4
+  %add3 = fadd float %i4, %i5
+  %call3 = tail call i1 @llvm.is.fpclass.f32(float %add3, i32 1)
+  %call3.ext = zext i1 %call3 to i32
+
+  %arrayidx6 = getelementptr inbounds float, float* %a, i32 3
+  %i6 = load float, float* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds float, float* %b, i32 3
+  %i7 = load float, float* %arrayidx7, align 4
+  %add4 = fadd float %i6, %i7
+  %call4 = tail call i1 @llvm.is.fpclass.f32(float %add4, i32 1)
+  %call4.ext = zext i1 %call4 to i32
+
+  store i32 %call1.ext, i32* %c, align 4
+  %arrayidx8 = getelementptr inbounds i32, i32* %c, i32 1
+  store i32 %call2.ext, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i32 2
+  store i32 %call3.ext, i32* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i32 3
+  store i32 %call4.ext, i32* %arrayidx10, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/Scalarizer/intrinsics.ll b/llvm/test/Transforms/Scalarizer/intrinsics.ll
--- a/llvm/test/Transforms/Scalarizer/intrinsics.ll
+++ b/llvm/test/Transforms/Scalarizer/intrinsics.ll
@@ -27,6 +27,8 @@
 declare <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float>)
 declare <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float>)
 
+; Unary fp op with a constant scalar operand and a non-overloaded return type
+declare <2 x i1> @llvm.is.fpclass.v2f32(<2 x float>, i32)
 
 ; CHECK-LABEL: @scalarize_sqrt_v2f32(
 ; CHECK: %sqrt.i0 = call float @llvm.sqrt.f32(float %x.i0)
@@ -159,3 +161,14 @@
   %sat = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> %x)
   ret <2 x i32> %sat
 }
+
+; CHECK-LABEL: @scalarize_is_fpclass(
+; CHECK: %cond.i0 = call i1 @llvm.is.fpclass.f32(float %x.i0, i32 1)
+; CHECK: %cond.i1 = call i1 @llvm.is.fpclass.f32(float %x.i1, i32 1)
+; CHECK: %cond.upto0 = insertelement <2 x i1> poison, i1 %cond.i0, i32 0
+; CHECK: %cond = insertelement <2 x i1> %cond.upto0, i1 %cond.i1, i32 1
+; CHECK: ret <2 x i1> %cond
+define <2 x i1> @scalarize_is_fpclass(<2 x float> %x) {
+  %cond = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> %x, i32 1)
+  ret <2 x i1> %cond
+}
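
The Scalarizer test exercises the same rule in the opposite direction: splitting the <2 x float> call into scalar calls must request the overload list {float}, not {i1, float}, which is what the guarded push in the Scalarizer.cpp hunk above achieves. A minimal sketch of that scalar lookup, again assuming a Module *M is in scope:

    // Scalarized llvm.is.fpclass: mangle only the element type of
    // operand 0; the scalar i1 return type never appears in the name.
    SmallVector<Type *, 3> Tys;
    Tys.push_back(Type::getFloatTy(M->getContext()));
    // Yields: declare i1 @llvm.is.fpclass.f32(float, i32)
    Function *ScalarF =
        Intrinsic::getDeclaration(M, Intrinsic::is_fpclass, Tys);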