diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -309,16 +309,16 @@
 /// Identify if the intrinsic is trivially vectorizable.
 /// This method returns true if the intrinsic's argument types are all scalars
 /// for the scalar form of the intrinsic and all vectors (or scalars handled by
-/// hasVectorIntrinsicScalarOpd) for the vector form of the intrinsic.
+/// isVectorIntrinsicWithScalarOpAtArg) for the vector form of the intrinsic.
 bool isTriviallyVectorizable(Intrinsic::ID ID);
 
 /// Identifies if the vector form of the intrinsic has a scalar operand.
-bool hasVectorIntrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx);
+bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
+                                        unsigned ScalarOpdIdx);
 
-/// Identifies if the vector form of the intrinsic has a scalar operand that has
+/// Identifies if the vector form of the intrinsic has an operand that has
 /// an overloaded type.
-bool hasVectorIntrinsicOverloadedScalarOpd(Intrinsic::ID ID,
-                                           unsigned ScalarOpdIdx);
+bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, unsigned OpdIdx);
 
 /// Returns intrinsic ID for call.
 /// For the input call instruction it finds mapping intrinsic and returns
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -3036,7 +3036,7 @@
     // Gather a column of constants.
     for (unsigned J = 0, JE = Operands.size(); J != JE; ++J) {
       // Some intrinsics use a scalar type for certain arguments.
-      if (hasVectorIntrinsicScalarOpd(IntrinsicID, J)) {
+      if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, J)) {
         Lane[J] = Operands[J];
         continue;
       }
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -40,7 +40,7 @@
 /// Return true if all of the intrinsic's arguments and return type are scalars
 /// for the scalar form of the intrinsic, and vectors for the vector form of the
 /// intrinsic (except operands that are marked as always being scalar by
-/// hasVectorIntrinsicScalarOpd).
+/// isVectorIntrinsicWithScalarOpAtArg).
 bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   switch (ID) {
   case Intrinsic::abs: // Begin integer bit-manipulation.
@@ -89,6 +89,8 @@
   case Intrinsic::fmuladd:
   case Intrinsic::powi:
   case Intrinsic::canonicalize:
+  case Intrinsic::fptosi_sat:
+  case Intrinsic::fptoui_sat:
     return true;
   default:
     return false;
@@ -96,8 +98,8 @@
 }
 
 /// Identifies if the vector form of the intrinsic has a scalar operand.
-bool llvm::hasVectorIntrinsicScalarOpd(Intrinsic::ID ID,
-                                       unsigned ScalarOpdIdx) {
+bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
+                                              unsigned ScalarOpdIdx) {
   switch (ID) {
   case Intrinsic::abs:
   case Intrinsic::ctlz:
@@ -114,11 +116,14 @@
   }
 }
 
-bool llvm::hasVectorIntrinsicOverloadedScalarOpd(Intrinsic::ID ID,
-                                                 unsigned ScalarOpdIdx) {
+bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
+                                                  unsigned OpdIdx) {
   switch (ID) {
+  case Intrinsic::fptosi_sat:
+  case Intrinsic::fptoui_sat:
+    return OpdIdx == 0;
   case Intrinsic::powi:
-    return (ScalarOpdIdx == 1);
+    return OpdIdx == 1;
   default:
     return false;
   }
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -109,7 +109,7 @@
     auto *ArgType = Arg.value()->getType();
     // Vector calls to intrinsics can still have
     // scalar operands for specific arguments.
-    if (hasVectorIntrinsicScalarOpd(IntrinsicID, Arg.index())) {
+    if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) {
      ScalarTypes.push_back(ArgType);
    } else {
      // The argument in this place should be a vector if
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -575,9 +575,11 @@
     if (OpI->getType()->isVectorTy()) {
       Scattered[I] = scatter(&CI, OpI);
       assert(Scattered[I].size() == NumElems && "mismatched call operands");
+      if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
+        Tys.push_back(OpI->getType()->getScalarType());
     } else {
       ScalarOperands[I] = OpI;
-      if (hasVectorIntrinsicOverloadedScalarOpd(ID, I))
+      if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
         Tys.push_back(OpI->getType());
     }
   }
@@ -593,7 +595,7 @@
     ScalarCallOps.clear();
 
     for (unsigned J = 0; J != NumArgs; ++J) {
-      if (hasVectorIntrinsicScalarOpd(ID, J))
+      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarCallOps.push_back(ScalarOperands[J]);
      else
        ScalarCallOps.push_back(Scattered[J][Elem]);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -772,7 +772,7 @@
       auto *SE = PSE.getSE();
       Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
       for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
-        if (hasVectorIntrinsicScalarOpd(IntrinID, i)) {
+        if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) {
          if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
            reportVectorizationFailure("Found unvectorizable intrinsic",
                "intrinsic instruction cannot be vectorized",
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4244,13 +4244,13 @@
       // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
       Value *Arg;
-      if (!UseVectorIntrinsic || !hasVectorIntrinsicScalarOpd(ID, I.index()))
+      if (!UseVectorIntrinsic ||
+          !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
         Arg = State.get(I.value(), Part);
-      else {
+      else
         Arg = State.get(I.value(), VPIteration(0, 0));
-        if (hasVectorIntrinsicOverloadedScalarOpd(ID, I.index()))
-          TysForDecl.push_back(Arg->getType());
-      }
+      if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
+        TysForDecl.push_back(Arg->getType());
       Args.push_back(Arg);
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -641,7 +641,7 @@
       CallInst *CI = cast<CallInst>(UserInst);
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
-        if (hasVectorIntrinsicScalarOpd(ID, i))
+        if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
           return (CI->getArgOperand(i) == Scalar);
       }
       LLVM_FALLTHROUGH;
@@ -4855,7 +4855,7 @@
       unsigned NumArgs = CI->arg_size();
       SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
       for (unsigned j = 0; j != NumArgs; ++j)
-        if (hasVectorIntrinsicScalarOpd(ID, j))
+        if (isVectorIntrinsicWithScalarOpAtArg(ID, j))
           ScalarArgs[j] = CI->getArgOperand(j);
       for (Value *V : VL) {
         CallInst *CI2 = dyn_cast<CallInst>(V);
@@ -4874,7 +4874,7 @@
       // Some intrinsics have scalar arguments and should be same in order for
       // them to be vectorized.
       for (unsigned j = 0; j != NumArgs; ++j) {
-        if (hasVectorIntrinsicScalarOpd(ID, j)) {
+        if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) {
           Value *A1J = CI2->getArgOperand(j);
           if (ScalarArgs[j] != A1J) {
             BS.cancelScheduling(VL, VL0);
@@ -4907,7 +4907,7 @@
       for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
         // For scalar operands no need to to create an entry since no need to
         // vectorize it.
-        if (hasVectorIntrinsicScalarOpd(ID, i))
+        if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
           continue;
         ValueList Operands;
         // Prepare the operand vector.
@@ -7486,11 +7486,11 @@
         ValueList OpVL;
         // Some intrinsics have scalar arguments. This argument should not be
         // vectorized.
-        if (UseIntrinsic && hasVectorIntrinsicScalarOpd(IID, j)) {
+        if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) {
           CallInst *CEI = cast<CallInst>(VL0);
           ScalarArg = CEI->getArgOperand(j);
           OpVecs.push_back(CEI->getArgOperand(j));
-          if (hasVectorIntrinsicOverloadedScalarOpd(IID, j))
+          if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
             TysForDecl.push_back(ScalarArg->getType());
           continue;
         }
@@ -7498,6 +7498,8 @@
         Value *OpVec = vectorizeTree(E->getOperand(j));
         LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
         OpVecs.push_back(OpVec);
+        if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
+          TysForDecl.push_back(OpVec->getType());
       }
 
       Function *CF;
diff --git a/llvm/test/Transforms/LoopVectorize/fpsat.ll b/llvm/test/Transforms/LoopVectorize/fpsat.ll
--- a/llvm/test/Transforms/LoopVectorize/fpsat.ll
+++ b/llvm/test/Transforms/LoopVectorize/fpsat.ll
@@ -8,21 +8,53 @@
 ; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK: for.body.preheader:
 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[Y:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[Y]], [[UGLYGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[X]], [[UGLYGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup.loopexit:
 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.fptosi.sat.i32.f32(float [[TMP0]])
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.fptosi.sat.i32.f32(float [[TMP8]])
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[TMP9]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 entry:
   %cmp6 = icmp sgt i32 %n, 0
@@ -54,21 +86,53 @@
 ; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK: for.body.preheader:
 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[Y:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[Y]], [[UGLYGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[X]], [[UGLYGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !alias.scope !8
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4, !alias.scope !11, !noalias !8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup.loopexit:
 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.fptoui.sat.i32.f32(float [[TMP0]])
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.fptoui.sat.i32.f32(float [[TMP8]])
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[TMP9]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ;
 entry:
   %cmp6 = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/fpsat.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/fpsat.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/fpsat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/fpsat.ll
@@ -6,24 +6,9 @@
 define void @signed(ptr %x, ptr %y, i32 %n) {
 ; CHECK-LABEL: @signed(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4
-; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1
-; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4
-; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4
-; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3
-; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4
-; CHECK-NEXT: [[L1:%.*]] = tail call i32 @llvm.fptosi.sat.i32.f32(float [[L0]])
-; CHECK-NEXT: [[L3:%.*]] = tail call i32 @llvm.fptosi.sat.i32.f32(float [[L2]])
-; CHECK-NEXT: [[L5:%.*]] = tail call i32 @llvm.fptosi.sat.i32.f32(float [[L4]])
-; CHECK-NEXT: [[L7:%.*]] = tail call i32 @llvm.fptosi.sat.i32.f32(float [[L6]])
-; CHECK-NEXT: store i32 [[L1]], ptr [[Y:%.*]], align 4
-; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 1
-; CHECK-NEXT: store i32 [[L3]], ptr [[ARRAYIDX2_1]], align 4
-; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 2
-; CHECK-NEXT: store i32 [[L5]], ptr [[ARRAYIDX2_2]], align 4
-; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 3
-; CHECK-NEXT: store i32 [[L7]], ptr [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP0]])
+; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[Y:%.*]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -51,24 +36,9 @@
 define void @unsigned(ptr %x, ptr %y, i32 %n) {
 ; CHECK-LABEL: @unsigned(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4
-; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1
-; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4
-; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4
-; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3
-; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4
-; CHECK-NEXT: [[L1:%.*]] = tail call i32 @llvm.fptoui.sat.i32.f32(float [[L0]])
-; CHECK-NEXT: [[L3:%.*]] = tail call i32 @llvm.fptoui.sat.i32.f32(float [[L2]])
-; CHECK-NEXT: [[L5:%.*]] = tail call i32 @llvm.fptoui.sat.i32.f32(float [[L4]])
-; CHECK-NEXT: [[L7:%.*]] = tail call i32 @llvm.fptoui.sat.i32.f32(float [[L6]])
-; CHECK-NEXT: store i32 [[L1]], ptr [[Y:%.*]], align 4
-; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 1
-; CHECK-NEXT: store i32 [[L3]], ptr [[ARRAYIDX2_1]], align 4
-; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 2
-; CHECK-NEXT: store i32 [[L5]], ptr [[ARRAYIDX2_2]], align 4
-; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 3
-; CHECK-NEXT: store i32 [[L7]], ptr [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP0]])
+; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[Y:%.*]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/Scalarizer/intrinsics.ll b/llvm/test/Transforms/Scalarizer/intrinsics.ll
--- a/llvm/test/Transforms/Scalarizer/intrinsics.ll
+++ b/llvm/test/Transforms/Scalarizer/intrinsics.ll
@@ -24,6 +24,9 @@
 declare <2 x i32> @llvm.smul.fix.sat.v2i32(<2 x i32>, <2 x i32>, i32)
 declare <2 x i32> @llvm.umul.fix.sat.v2i32(<2 x i32>, <2 x i32>, i32)
 
+declare <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float>)
+declare <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float>)
+
 ; CHECK-LABEL: @scalarize_sqrt_v2f32(
 ; CHECK: %sqrt.i0 = call float @llvm.sqrt.f32(float %x.i0)
@@ -134,3 +137,25 @@
   %umulfixsat = call <2 x i32> @llvm.umul.fix.sat.v2i32(<2 x i32> %x, <2 x i32> , i32 31)
   ret <2 x i32> %umulfixsat
 }
+
+; CHECK-LABEL: @scalarize_fptosi_sat(
+; CHECK: %sat.i0 = call i32 @llvm.fptosi.sat.i32.f32(float %x.i0)
+; CHECK: %sat.i1 = call i32 @llvm.fptosi.sat.i32.f32(float %x.i1)
+; CHECK: %sat.upto0 = insertelement <2 x i32> poison, i32 %sat.i0, i32 0
+; CHECK: %sat = insertelement <2 x i32> %sat.upto0, i32 %sat.i1, i32 1
+; CHECK: ret <2 x i32> %sat
+define <2 x i32> @scalarize_fptosi_sat(<2 x float> %x) #0 {
+  %sat = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> %x)
+  ret <2 x i32> %sat
+}
+
+; CHECK-LABEL: @scalarize_fptoui_sat(
+; CHECK: %sat.i0 = call i32 @llvm.fptoui.sat.i32.f32(float %x.i0)
+; CHECK: %sat.i1 = call i32 @llvm.fptoui.sat.i32.f32(float %x.i1)
+; CHECK: %sat.upto0 = insertelement <2 x i32> poison, i32 %sat.i0, i32 0
+; CHECK: %sat = insertelement <2 x i32> %sat.upto0, i32 %sat.i1, i32 1
+; CHECK: ret <2 x i32> %sat
+define <2 x i32> @scalarize_fptoui_sat(<2 x float> %x) #0 {
+  %sat = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> %x)
+  ret <2 x i32> %sat
+}
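
Usage note (not part of the patch): the sketch below shows how a transform would typically consult the renamed hooks, mirroring the LoopVectorize and SLPVectorizer call sites above. The helper name collectVectorCallInfo and its parameter list are illustrative only; the three predicates and their signatures come from the VectorUtils.h hunk in this patch.

// Illustrative sketch, not code added by this patch. It queries, per call
// argument, whether the argument must stay scalar when the call is widened
// and whether its type feeds the intrinsic's overloaded declaration.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

static void collectVectorCallInfo(Intrinsic::ID ID, CallBase &CB,
                                  SmallVectorImpl<unsigned> &ScalarArgIdxs,
                                  SmallVectorImpl<Type *> &OverloadTys) {
  // Only intrinsics that map element-wise onto a vector form are handled.
  if (!isTriviallyVectorizable(ID))
    return;
  for (unsigned I = 0, E = CB.arg_size(); I != E; ++I) {
    // Arguments such as the immediate flag of abs/ctlz stay scalar and must
    // not be widened.
    if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
      ScalarArgIdxs.push_back(I);
    // Arguments that participate in the intrinsic's overloaded signature,
    // e.g. the source of fptosi.sat/fptoui.sat or the i32 exponent of powi,
    // contribute their type to the declaration lookup.
    if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
      OverloadTys.push_back(CB.getArgOperand(I)->getType());
  }
}

In the transforms touched above, the same queries are made against the already-widened operands, so the types collected into TysForDecl/Tys become the vector overload types used to fetch the vector intrinsic declaration.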