diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7188,7 +7188,6 @@
   case Instruction::Mul:
   case Instruction::FMul:
   case Instruction::FDiv:
-  case Instruction::FRem:
   case Instruction::Shl:
   case Instruction::LShr:
   case Instruction::AShr:
@@ -7221,6 +7220,66 @@
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
         Op2Info, Operands, I);
   }
+  case Instruction::FRem: {
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this is shifts on x86.
+    Value *Op2 = I->getOperand(1);
+    auto Op2Info = TTI.getOperandInfo(Op2);
+    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+        Legal->isInvariant(Op2))
+      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+
+    SmallVector<const Value *, 4> Operands(I->operand_values());
+    InstructionCost Cost = TTI.getArithmeticInstrCost(
+        I->getOpcode(), VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        Op2Info, Operands, I);
+    if (Cost != InstructionCost::getInvalid())
+      return Cost;
+    // We need to check whether a vector library function is available, as we
+    // don't want to emit frem instructions on scalable vectors for targets on
+    // which such instructions cannot be code generated.
+    if (VF.isScalable()) {
+      if (TLI) {
+        Module *M = I->getModule();
+        StringRef ScalarFnName;
+        Type *Ty = I->getType();
+        if (Ty->isFloatTy())
+          ScalarFnName = TLI->getName(LibFunc_fmodf);
+        else if (Ty->isDoubleTy())
+          ScalarFnName = TLI->getName(LibFunc_fmod);
+        else
+          return InstructionCost::getInvalid();
+        Type *RetTy = ToVectorTy(Ty, VF);
+        SmallVector<Type *> Tys = {RetTy, RetTy};
+        Function *TLIFunc = nullptr;
+        StringRef TLIName = TLI->getVectorizedFunction(ScalarFnName, VF);
+        if (TLIName.empty()) {
+          TLIName = TLI->getVectorizedFunction(ScalarFnName, VF, /*Masked=*/true);
+          if (TLIName.empty())
+            return InstructionCost::getInvalid();
+          // Get the mask position.
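+          // Only a masked variant of the routine exists; recover the mask
+          // position from the VFABI name and mirror it in the costed type list.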
+          std::optional<VFInfo> Info =
+              VFABI::tryDemangleForVFABI(TLIName, *M, VF);
+          if (!Info)
+            return InstructionCost::getInvalid();
+          unsigned MaskPos = Info->getParamIndexForOptionalMask().value();
+          Tys.insert(Tys.begin() + MaskPos,
+                     VectorType::get(Type::getInt1Ty(M->getContext()), VF));
+        }
+        TLIFunc = Function::Create(FunctionType::get(RetTy, Tys, false),
+                                   Function::ExternalLinkage, ScalarFnName, *M);
+        if (TLIFunc == nullptr)
+          return InstructionCost::getInvalid();
+        return TTI.getCallInstrCost(TLIFunc, RetTy, Tys,
+                                    TTI::TCK_RecipThroughput);
+      }
+      return InstructionCost::getInvalid();
+    }
+    return InstructionCost::getInvalid();
+  }
   case Instruction::FNeg: {
     return TTI.getArithmeticInstrCost(
         I->getOpcode(), VectorTy, CostKind,
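With -vector-library=sleefgnuabi the scalable fmod/fmodf mappings are masked,
so the lookup above is expected to take the Masked=true path: the VFABI name is
demangled to find the mask position and an i1 vector type is inserted into the
type list before costing. As an illustrative sketch only (the mangled name and
mask position are whatever the TLI mapping and demangler actually report), for
VF = vscale x 2 the prototype handed to getCallInstrCost would correspond to a
declaration like:

  declare <vscale x 2 x double> @_ZGVsMxvv_fmod(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>)
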
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/frem.ll b/llvm/test/Transforms/LoopVectorize/AArch64/frem.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/frem.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -mtriple aarch64-linux-generic -mattr=+sve -vector-library=sleefgnuabi -passes=loop-vectorize,instcombine -S < %s | FileCheck %s
+
+define void @fmod_vec(ptr noalias nocapture %a,
+; CHECK-LABEL: define void @fmod_vec
+; CHECK-SAME: (ptr noalias nocapture [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 [[TMP2]]
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = frem fast <vscale x 2 x double> [[WIDE_LOAD]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 0x40091EB860000000, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP5:%.*]] = frem fast <vscale x 2 x double> [[WIDE_LOAD1]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 0x40091EB860000000, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 2 x double> [[TMP4]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP8]]
+; CHECK-NEXT:    store <vscale x 2 x double> [[TMP5]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+  ptr noalias nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+for.body:                                         ; preds = %entry, %for.body
+  %i = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %b, i64 %i
+  %0 = load double, ptr %arrayidx, align 8
+  %1 = frem fast double %0, 0x40091EB860000000
+  %arrayidx2 = getelementptr inbounds double, ptr %a, i64 %i
+  store double %1, ptr %arrayidx2, align 8
+  %inc = add nuw nsw i64 %i, 1
+  %cmp = icmp ult i64 %inc, 256
+  br i1 %cmp, label %for.body, label %for.end
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @fmodf_vec(ptr noalias nocapture %a,
+; CHECK-LABEL: define void @fmodf_vec
+; CHECK-SAME: (ptr noalias nocapture [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i64 [[TMP2]]
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = frem fast <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0x40091EB860000000, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP5:%.*]] = frem fast <vscale x 4 x float> [[WIDE_LOAD1]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0x40091EB860000000, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP8]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP5]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 3
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+  ptr noalias nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+for.body:                                         ; preds = %entry, %for.body
+  %i = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %b, i64 %i
+  %0 = load float, ptr %arrayidx, align 4
+  %1 = frem fast float %0, 0x40091EB860000000
+  %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %i
+  store float %1, ptr %arrayidx2, align 4
+  %inc = add nuw nsw i64 %i, 1
+  %cmp = icmp ult i64 %inc, 256
+  br i1 %cmp, label %for.body, label %for.end
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { vscale_range(1,16) }