diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -14405,6 +14405,8 @@ When specified with the fast-math-flag 'afn', the result may be approximated using a less accurate calculation. +.. _int_powi: + '``llvm.powi.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -20055,6 +20057,55 @@ %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> poison +.. _int_vp_powi: + +'``llvm.vp.powi.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x float> @llvm.vp.powi.v16f32.i32 (<16 x float> <op>, i32 <power>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x float> @llvm.vp.powi.nxv4f32.i32 (<vscale x 4 x float> <op>, i32 <power>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x double> @llvm.vp.powi.v256f64.i64 (<256 x double> <op>, i64 <power>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated raising of a vector of floating-point values to an integer power. + + +Arguments: +"""""""""" + +The first operand and the result have the same vector of floating-point type. +The second operand is an integer power. The third operand is the vector mask and +has the same number of elements as the result vector type. The fourth operand is +the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.powi``' intrinsic performs floating-point powi (:ref:`powi <int_powi>`) of +the first vector operand on each enabled lane with the second operand as +exponent. The result on disabled lanes is a :ref:`poison value <poisonvalues>`. +The operation is performed in the default floating-point environment. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x float> @llvm.vp.powi.v4f32.i32(<4 x float> %a, i32 %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %a, i32 %b) + %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> poison + + .. 
_int_vp_fma: '``llvm.vp.fma.*``' Intrinsics diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def --- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def +++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def @@ -45,6 +45,7 @@ FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass, (true)) FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass, ()) FUNCTION_PASS("expand-large-fp-convert", ExpandLargeFpConvertPass, ()) +FUNCTION_PASS("expand-powi", ExpandPowiPass, ()) FUNCTION_PASS("expand-reductions", ExpandReductionsPass, ()) FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ()) FUNCTION_PASS("lowerinvoke", LowerInvokePass, ()) diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -518,6 +518,9 @@ // Expands large div/rem instructions. FunctionPass *createExpandLargeFpConvertPass(); + // Expands powi instructions. + FunctionPass *createExpandPowiPass(); + // This pass expands memcmp() to load/stores. 
FunctionPass *createExpandMemCmpPass(); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1682,6 +1682,11 @@ [ LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; + def int_vp_powi : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; // Casts def int_vp_trunc : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -364,6 +364,9 @@ BEGIN_REGISTER_VP(vp_nearbyint, 1, 2, VP_FNEARBYINT, -1) END_REGISTER_VP(vp_nearbyint, VP_FNEARBYINT) +// llvm.vp.powi(x, y, mask,vlen) +BEGIN_REGISTER_VP_INTRINSIC(vp_powi, 2, 3) +END_REGISTER_VP_INTRINSIC(vp_powi) ///// } Floating-Point Arithmetic ///// Type Casts { diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -126,6 +126,7 @@ void initializeExpandLargeDivRemLegacyPassPass(PassRegistry&); void initializeExpandMemCmpPassPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); +void initializeExpandPowiLegacyPassPass(PassRegistry &); void initializeExpandReductionsPass(PassRegistry&); void initializeExpandVectorPredicationPass(PassRegistry &); void initializeMakeGuardsExplicitLegacyPassPass(PassRegistry&); diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -59,6 +59,7 @@ ExpandLargeFpConvert.cpp ExpandMemCmp.cpp ExpandPostRAPseudos.cpp + ExpandPowi.cpp ExpandReductions.cpp ExpandVectorPredication.cpp FaultMaps.cpp diff --git a/llvm/lib/CodeGen/ExpandPowi.cpp 
b/llvm/lib/CodeGen/ExpandPowi.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/CodeGen/ExpandPowi.cpp
@@ -0,0 +1,188 @@
+//===--- ExpandPowi.cpp - Expand Powi intrinsics ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for powi/vp.powi. The expansion is based on
+// compiler-rt/__powidf2.c.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "expand-powi"
+
+using namespace llvm;
+
+// Create a Value holding the maximum effective vector length of I, i.e. the
+// runtime element count (vscale * known-minimum element count) of its scalable
+// vector result type. The computation is inserted immediately before I.
+static Value *getMaxEVL(Instruction *I) {
+  assert(isa<ScalableVectorType>(I->getType()) &&
+         "Only serve scalable llvm.powi now.");
+
+  ElementCount EC = cast<ScalableVectorType>(I->getType())->getElementCount();
+  IRBuilder<> Builder(I);
+  Type *Int32Ty = Type::getInt32Ty(I->getContext());
+  Value *VScale = Builder.CreateIntrinsic(Int32Ty, Intrinsic::vscale, {});
+  Value *FactorConst = Builder.getInt32(EC.getKnownMinValue());
+  return Builder.CreateMul(VScale, FactorConst, "scalable_size",
+                           /*NUW*/ true, /*NSW*/ false);
+}
+
+// The expansion is based on the c code of compiler-rt/__powidf2.c,
+//   const int recip = b < 0;
+//   double r = 1;
+//   while (1) {
+//     if (b & 1)
+//       r *= a;
+//     b /= 2;
+//     if (b == 0)
+//       break;
+//     a *= a;
+//   }
+//   return recip ? 1 / r : r;
+//
+// Expansion of llvm.powi still uses vp intrinsics here. It regards llvm.powi as
+// llvm.vp.powi with true mask and maximum vl.
+//
+// The block containing II is split at II and a single-block loop is inserted
+// between the two halves. The loop carries the running base, exponent and
+// result in PHI nodes; the reciprocal for negative exponents is formed after
+// the loop, and finally II is replaced by the computed value and erased.
+static void expandPowi(IntrinsicInst *II) {
+  LLVMContext &C = II->getContext();
+  Value *OrigBase = II->getOperand(0);
+  Value *OrigExp = II->getOperand(1);
+  VectorType *BaseTy = cast<VectorType>(OrigBase->getType());
+  Type *ExpTy = OrigExp->getType();
+  Type *CondTy = BaseTy->getWithNewType(Type::getInt1Ty(C));
+  Value *True = ConstantInt::get(CondTy, 1);
+  Value *Mask, *EVL;
+  if (II->getIntrinsicID() == Intrinsic::vp_powi) {
+    Mask = II->getOperand(2);
+    EVL = II->getOperand(3);
+  } else {
+    assert(II->getIntrinsicID() == Intrinsic::powi);
+    Mask = True;
+    EVL = getMaxEVL(II);
+  }
+
+  BasicBlock *PreLoopBB = II->getParent();
+  BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock(II, "powi-post-loop");
+  BasicBlock *LoopBody =
+      BasicBlock::Create(PreLoopBB->getContext(), "powi-expansion-loop",
+                         PreLoopBB->getParent(), PostLoopBB);
+
+  // splitBasicBlock() ended PreLoopBB with a branch to PostLoopBB. Branch to
+  // the loop instead and drop that old terminator.
+  IRBuilder<> Builder(PreLoopBB->getTerminator());
+  Builder.CreateBr(LoopBody);
+  PreLoopBB->getTerminator()->eraseFromParent();
+
+  Builder.SetInsertPoint(LoopBody);
+  // Create phi of base.
+  PHINode *Base = Builder.CreatePHI(BaseTy, 2, "base");
+  Base->addIncoming(OrigBase, PreLoopBB);
+  // Create phi of exponent.
+  PHINode *Exp = Builder.CreatePHI(ExpTy, 2, "exp");
+  Exp->addIncoming(OrigExp, PreLoopBB);
+  // Create phi of res.
+  PHINode *Res = Builder.CreatePHI(BaseTy, 2, "res");
+  Res->addIncoming(ConstantFP::get(BaseTy, 1.), PreLoopBB);
+  // Res *= Base if Exp is odd.
+  Value *Tmp = Builder.CreateIntrinsic(BaseTy, Intrinsic::vp_fmul,
+                                       {Res, Base, True, EVL});
+  Value *And1 = Builder.CreateAnd(Exp, ConstantInt::get(ExpTy, 1));
+  Value *IsOdd = Builder.CreateICmpNE(And1, ConstantInt::get(ExpTy, 0));
+  Value *IsOddVec = Builder.CreateVectorSplat(BaseTy->getElementCount(), IsOdd);
+  Value *NewRes = Builder.CreateIntrinsic(BaseTy, Intrinsic::vp_select,
+                                          {IsOddVec, Tmp, Res, EVL});
+  Res->addIncoming(NewRes, LoopBody);
+  // Update Exp. Divide as a signed value (rounding toward zero) to match
+  // `b /= 2` in the reference code; a logical shift would turn a negative
+  // exponent into a large unsigned one and multiply in the wrong powers.
+  Value *NewExp = Builder.CreateSDiv(Exp, ConstantInt::get(ExpTy, 2));
+  Exp->addIncoming(NewExp, LoopBody);
+  // Update Base.
+  Value *NewBase = Builder.CreateIntrinsic(BaseTy, Intrinsic::vp_fmul,
+                                           {Base, Base, True, EVL});
+  Base->addIncoming(NewBase, LoopBody);
+  // Leave the loop once NewExp is zero, i.e. the whole exponent is consumed.
+  Builder.CreateCondBr(Builder.CreateICmpEQ(NewExp, ConstantInt::get(ExpTy, 0)),
+                       PostLoopBB, LoopBody);
+
+  Builder.SetInsertPoint(&PostLoopBB->front());
+  // Use reciprocal if power is negative.
+  Value *Recip =
+      Builder.CreateIntrinsic(BaseTy, Intrinsic::vp_fdiv,
+                              {ConstantFP::get(BaseTy, 1.), NewRes, Mask, EVL});
+  Value *IsNegative =
+      Builder.CreateICmpSLT(OrigExp, ConstantInt::get(ExpTy, 0));
+  Value *IsNegativeVec =
+      Builder.CreateVectorSplat(BaseTy->getElementCount(), IsNegative);
+  Value *Powi = Builder.CreateIntrinsic(BaseTy, Intrinsic::vp_select,
+                                        {IsNegativeVec, Recip, NewRes, EVL});
+  II->replaceAllUsesWith(Powi);
+  II->eraseFromParent();
+}
+
+// Expand every llvm.vp.powi call, and every llvm.powi call whose result is a
+// scalable vector, found in F. Returns true if anything was expanded.
+static bool runImpl(Function &F) {
+  // Collect first: expandPowi() splits blocks and erases the call, so F must
+  // not be modified while instructions(F) is being iterated.
+  SmallVector<IntrinsicInst *, 4> Replace;
+  for (auto &I : instructions(F)) {
+    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+      // TODO: Add cost model to select small fixed vectors llvm.powi.
+      if (II->getIntrinsicID() == Intrinsic::vp_powi ||
+          (II->getIntrinsicID() == Intrinsic::powi &&
+           isa<ScalableVectorType>(II->getType())))
+        Replace.push_back(II);
+    }
+  }
+
+  if (Replace.empty())
+    return false;
+
+  for (IntrinsicInst *II : Replace)
+    expandPowi(II);
+
+  return true;
+}
+
+namespace {
+// Legacy pass manager wrapper that runs the expansion on each function.
+class ExpandPowiLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  ExpandPowiLegacyPass() : FunctionPass(ID) {
+    initializeExpandPowiLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override { return runImpl(F); }
+};
+} // namespace
+
+char ExpandPowiLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ExpandPowiLegacyPass, "expand-powi",
+                      "Expand powi functions", false, false)
+INITIALIZE_PASS_END(ExpandPowiLegacyPass, "expand-powi",
+                    "Expand powi functions", false, false)
+
+FunctionPass *llvm::createExpandPowiPass() {
+  return new ExpandPowiLegacyPass();
+}
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1088,6 +1088,7 @@
   PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
   addPass(createExpandLargeDivRemPass());
   addPass(createExpandLargeFpConvertPass());
+  addPass(createExpandPowiPass());
   addIRPasses();
   addCodeGenPrepare();
   addPassesToHandleExceptions();
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -631,6 +631,10 @@
     VPFunc = Intrinsic::getDeclaration(
         M, VPID, {Params[0]->getType(), Params[1]->getType()});
     break;
+  case Intrinsic::vp_powi:
+    VPFunc = Intrinsic::getDeclaration(
+        M, VPID, {Params[0]->getType(), Params[1]->getType()});
+    break;
   }
   assert(VPFunc && "Could not declare VP intrinsic");
   return VPFunc;
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ 
b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -17,6 +17,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Module Verifier ; CHECK-NEXT: Lower Garbage Collection Instructions diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -21,6 +21,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: SVE intrinsics optimizations ; CHECK-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -28,6 +28,7 @@ ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Expand large div/rem ; GCN-O0-NEXT: Expand large fp convert +; GCN-O0-NEXT: Expand powi functions ; GCN-O0-NEXT: AMDGPU Printf lowering ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Dominator Tree Construction @@ -178,6 +179,7 @@ ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Expand large div/rem ; GCN-O1-NEXT: Expand large fp convert +; GCN-O1-NEXT: Expand powi functions ; GCN-O1-NEXT: AMDGPU Printf lowering ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Dominator Tree Construction @@ -453,6 +455,7 @@ ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Expand large div/rem ; GCN-O1-OPTS-NEXT: Expand large fp convert +; GCN-O1-OPTS-NEXT: Expand powi functions ; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -760,6 +763,7 @@ ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Expand large div/rem ; GCN-O2-NEXT: Expand 
large fp convert +; GCN-O2-NEXT: Expand powi functions ; GCN-O2-NEXT: AMDGPU Printf lowering ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Dominator Tree Construction @@ -1070,6 +1074,7 @@ ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Expand large div/rem ; GCN-O3-NEXT: Expand large fp convert +; GCN-O3-NEXT: Expand powi functions ; GCN-O3-NEXT: AMDGPU Printf lowering ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -7,6 +7,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/Generic/expand-powi.ll b/llvm/test/CodeGen/Generic/expand-powi.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Generic/expand-powi.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -expand-powi -S < %s | FileCheck %s +declare @llvm.vp.powi.nxv1f32.i32(, i32, , i32) +define @foo( %a, i32 %b, %m, i32 %evl) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[POWI_EXPANSION_LOOP:%.*]] +; CHECK: powi-expansion-loop: +; CHECK-NEXT: [[BASE:%.*]] = phi [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[POWI_EXPANSION_LOOP]] ] +; CHECK-NEXT: [[EXP:%.*]] = phi i32 [ [[B:%.*]], [[ENTRY]] ], [ [[TMP4:%.*]], [[POWI_EXPANSION_LOOP]] ] +; CHECK-NEXT: [[RES:%.*]] = phi [ shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer), [[ENTRY]] ], [ [[TMP3:%.*]], [[POWI_EXPANSION_LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.vp.fmul.nxv1f32( [[RES]], [[BASE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, 
zeroinitializer), i32 [[EVL:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[EXP]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i1 [[TMP2]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP3]] = call @llvm.vp.select.nxv1f32( [[DOTSPLAT]], [[TMP0]], [[RES]], i32 [[EVL]]) +; CHECK-NEXT: [[TMP4]] = lshr i32 [[EXP]], 1 +; CHECK-NEXT: [[TMP5]] = call @llvm.vp.fmul.nxv1f32( [[BASE]], [[BASE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[EVL]]) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], 1 +; CHECK-NEXT: br i1 [[TMP6]], label [[POWI_POST_LOOP:%.*]], label [[POWI_EXPANSION_LOOP]] +; CHECK: powi-post-loop: +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vp.fdiv.nxv1f32( shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer), [[TMP3]], [[M:%.*]], i32 [[EVL]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp slt i32 [[B]], 0 +; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i1 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector [[DOTSPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vp.select.nxv1f32( [[DOTSPLAT2]], [[TMP7]], [[TMP3]], i32 [[EVL]]) +; CHECK-NEXT: ret [[TMP9]] +; +entry: + %0 = call @llvm.vp.powi.nxv1f32.i32( %a, i32 %b, %m, i32 %evl) + ret %0 +} + +declare @llvm.powi.nxv1f32.i32(, i32) +define @foo2( %a, i32 %b) { +; CHECK-LABEL: @foo2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[TMP0]], 1 +; CHECK-NEXT: br label [[POWI_EXPANSION_LOOP:%.*]] +; CHECK: powi-expansion-loop: +; CHECK-NEXT: [[BASE:%.*]] = phi [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[POWI_EXPANSION_LOOP]] ] +; CHECK-NEXT: [[EXP:%.*]] = phi i32 [ [[B:%.*]], [[ENTRY]] ], [ [[TMP5:%.*]], [[POWI_EXPANSION_LOOP]] ] +; CHECK-NEXT: [[RES:%.*]] = 
phi [ shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer), [[ENTRY]] ], [ [[TMP4:%.*]], [[POWI_EXPANSION_LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vp.fmul.nxv1f32( [[RES]], [[BASE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[SCALABLE_SIZE]]) +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EXP]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i1 [[TMP3]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP4]] = call @llvm.vp.select.nxv1f32( [[DOTSPLAT]], [[TMP1]], [[RES]], i32 [[SCALABLE_SIZE]]) +; CHECK-NEXT: [[TMP5]] = lshr i32 [[EXP]], 1 +; CHECK-NEXT: [[TMP6]] = call @llvm.vp.fmul.nxv1f32( [[BASE]], [[BASE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[SCALABLE_SIZE]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP5]], 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[POWI_POST_LOOP:%.*]], label [[POWI_EXPANSION_LOOP]] +; CHECK: powi-post-loop: +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vp.fdiv.nxv1f32( shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer), [[TMP4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[SCALABLE_SIZE]]) +; CHECK-NEXT: [[TMP9:%.*]] = icmp slt i32 [[B]], 0 +; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i1 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector [[DOTSPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vp.select.nxv1f32( [[DOTSPLAT2]], [[TMP8]], [[TMP4]], i32 [[SCALABLE_SIZE]]) +; CHECK-NEXT: ret [[TMP10]] +; +entry: + %0 = call @llvm.powi.nxv1f32.i32( %a, i32 %b) + ret %0 +} diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll --- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll +++ 
b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll @@ -21,6 +21,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Module Verifier ; CHECK-NEXT: Lower Garbage Collection Instructions diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll --- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll @@ -33,6 +33,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Module Verifier ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/M68k/pipeline.ll b/llvm/test/CodeGen/M68k/pipeline.ll --- a/llvm/test/CodeGen/M68k/pipeline.ll +++ b/llvm/test/CodeGen/M68k/pipeline.ll @@ -4,6 +4,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Module Verifier ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll --- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll @@ -18,6 +18,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: PPC Lower MASS Entries ; CHECK-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -21,6 +21,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; 
CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Convert i1 constants to i32/i64 if they are returned ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: PPC Lower MASS Entries diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll --- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll @@ -21,6 +21,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Module Verifier ; CHECK-NEXT: Lower Garbage Collection Instructions diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -25,6 +25,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Natural Loop Information diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-powi.ll b/llvm/test/CodeGen/RISCV/rvv/expand-powi.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/expand-powi.ll @@ -0,0 +1,151 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 + +declare @llvm.vp.powi.nxv1f32.i32(, i32, , i32) +define @foo( %a, i32 %b, %m, i32 %evl) { +; RV32-LABEL: foo: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vmv1r.v v9, v0 +; RV32-NEXT: lui a2, 260096 +; RV32-NEXT: vsetvli a3, zero, e32, mf2, ta, 
ma +; RV32-NEXT: vmv.v.x v10, a2 +; RV32-NEXT: li a2, 1 +; RV32-NEXT: mv a3, a0 +; RV32-NEXT: .LBB0_1: # %powi-expansion-loop +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: andi a4, a3, 1 +; RV32-NEXT: vsetvli a5, zero, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.x v11, a4 +; RV32-NEXT: vmsne.vi v0, v11, 0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vfmul.vv v10, v10, v8, v0.t +; RV32-NEXT: srli a3, a3, 1 +; RV32-NEXT: vfmul.vv v8, v8, v8 +; RV32-NEXT: bne a3, a2, .LBB0_1 +; RV32-NEXT: # %bb.2: # %powi-post-loop +; RV32-NEXT: lui a2, 260096 +; RV32-NEXT: fmv.w.x ft0, a2 +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v0, v9 +; RV32-NEXT: vfrdiv.vf v8, v10, ft0, v0.t +; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vmsne.vi v0, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: foo: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vmv1r.v v9, v0 +; RV64-NEXT: lui a2, 260096 +; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a2 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: li a2, 1 +; RV64-NEXT: mv a3, a0 +; RV64-NEXT: .LBB0_1: # %powi-expansion-loop +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: andi a4, a3, 1 +; RV64-NEXT: vsetvli a5, zero, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.x v11, a4 +; RV64-NEXT: vmsne.vi v0, v11, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vfmul.vv v10, v10, v8, v0.t +; RV64-NEXT: srliw a3, a3, 1 +; RV64-NEXT: vfmul.vv v8, v8, v8 +; RV64-NEXT: bne a3, a2, .LBB0_1 +; RV64-NEXT: # %bb.2: # %powi-post-loop +; RV64-NEXT: lui a2, 260096 +; RV64-NEXT: fmv.w.x ft0, a2 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v0, v9 +; RV64-NEXT: vfrdiv.vf v8, v10, ft0, v0.t +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: slti a0, a0, 0 +; RV64-NEXT: vsetvli 
a2, zero, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vmsne.vi v0, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 +; RV64-NEXT: ret +entry: + %0 = call @llvm.vp.powi.nxv1f32.i32( %a, i32 %b, %m, i32 %evl) + ret %0 +} + +declare @llvm.powi.nxv1f32.i32(, i32) +define @foo2( %a, i32 %b) { +; RV32-LABEL: foo2: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vmv1r.v v9, v8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: lui a2, 260096 +; RV32-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.x v8, a2 +; RV32-NEXT: li a2, 1 +; RV32-NEXT: mv a3, a0 +; RV32-NEXT: .LBB1_1: # %powi-expansion-loop +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: andi a4, a3, 1 +; RV32-NEXT: vsetvli a5, zero, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.x v10, a4 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vfmul.vv v8, v8, v9, v0.t +; RV32-NEXT: srli a3, a3, 1 +; RV32-NEXT: vfmul.vv v9, v9, v9 +; RV32-NEXT: bne a3, a2, .LBB1_1 +; RV32-NEXT: # %bb.2: # %powi-post-loop +; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vmsne.vi v0, v9, 0 +; RV32-NEXT: lui a0, 260096 +; RV32-NEXT: fmv.w.x ft0, a0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vfrdiv.vf v8, v8, ft0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: foo2: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vmv1r.v v9, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a1, a1, 3 +; RV64-NEXT: lui a2, 260096 +; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; RV64-NEXT: vmv.v.x v8, a2 +; RV64-NEXT: li a2, 1 +; RV64-NEXT: mv a3, a0 +; RV64-NEXT: .LBB1_1: # %powi-expansion-loop +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: andi a4, a3, 1 +; RV64-NEXT: vsetvli a5, zero, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.x v10, a4 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; 
RV64-NEXT: vfmul.vv v8, v8, v9, v0.t +; RV64-NEXT: srliw a3, a3, 1 +; RV64-NEXT: vfmul.vv v9, v9, v9 +; RV64-NEXT: bne a3, a2, .LBB1_1 +; RV64-NEXT: # %bb.2: # %powi-post-loop +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: slti a0, a0, 0 +; RV64-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vmsne.vi v0, v9, 0 +; RV64-NEXT: lui a0, 260096 +; RV64-NEXT: fmv.w.x ft0, a0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vfrdiv.vf v8, v8, ft0, v0.t +; RV64-NEXT: ret +entry: + %0 = call @llvm.powi.nxv1f32.i32( %a, i32 %b) + ret %0 +} diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -19,6 +19,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -29,6 +29,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand large div/rem ; CHECK-NEXT: Expand large fp convert +; CHECK-NEXT: Expand powi functions ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -363,6 +363,7 @@ initializeConstantHoistingLegacyPassPass(*Registry); initializeScalarOpts(*Registry); initializeVectorization(*Registry); + initializeExpandPowiLegacyPassPass(*Registry); initializeScalarizeMaskedMemIntrinLegacyPassPass(*Registry); initializeExpandReductionsPass(*Registry); initializeExpandVectorPredicationPass(*Registry); diff --git a/llvm/tools/opt/opt.cpp 
b/llvm/tools/opt/opt.cpp --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -394,6 +394,7 @@ "fix-irreducible", "expand-large-fp-convert", "callbrprepare", + "expand-powi", }; for (const auto &P : PassNamePrefix) if (Pass.startswith(P)) @@ -443,6 +444,7 @@ initializeExpandLargeDivRemLegacyPassPass(Registry); initializeExpandLargeFpConvertLegacyPassPass(Registry); initializeExpandMemCmpPassPass(Registry); + initializeExpandPowiLegacyPassPass(Registry); initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry); initializeSelectOptimizePass(Registry); initializeCallBrPreparePass(Registry); diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -77,6 +77,8 @@ "i32)"; Str << " declare <8 x float> @llvm.vp.sqrt.v8f32(<8 x float>, <8 x i1>, " "i32)"; + Str << " declare <8 x float> @llvm.vp.powi.v8f32.i32(<8 x float>, i32, " + "<8 x i1>, i32)"; Str << " declare <8 x float> @llvm.vp.fma.v8f32(<8 x float>, <8 x float>, " "<8 x float>, <8 x i1>, i32) "; Str << " declare <8 x float> @llvm.vp.fmuladd.v8f32(<8 x float>, "