diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -14405,6 +14405,8 @@
When specified with the fast-math-flag 'afn', the result may be approximated
using a less accurate calculation.
+.. _int_powi:
+
'``llvm.powi.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -20055,6 +20057,55 @@
%also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> poison
+.. _int_vp_powi:
+
+'``llvm.vp.powi.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
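+ declare <16 x float> @llvm.vp.powi.v16f32.i32 (<16 x float> <op1>, i32 <op2>, <16 x i1> <mask>, i32 <vector_length>)
+ declare <vscale x 4 x float> @llvm.vp.powi.nxv4f32.i32 (<vscale x 4 x float> <op1>, i32 <op2>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+ declare <256 x double> @llvm.vp.powi.v256f64.i64 (<256 x double> <op1>, i64 <op2>, <256 x i1> <mask>, i32 <vector_length>)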
+
+Overview:
+"""""""""
+
+Predicated integer power of a vector of floating-point values.
+
+
+Arguments:
+""""""""""
+
+The first operand and the result have the same vector of floating-point type.
+The second operand is an integer power. The third operand is the vector mask and
+has the same number of elements as the result vector type. The fourth operand is
+the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.powi``' intrinsic performs floating-point powi (:ref:`powi <int_powi>`) of
+the first vector operand on each enabled lane with the second operand as
+exponent. The result on disabled lanes is a :ref:`poison value <poisonvalues>`.
+The operation is performed in the default floating-point environment.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+ %r = call <4 x float> @llvm.vp.powi.v4f32.i32(<4 x float> %a, i32 %b, <4 x i1> %mask, i32 %evl)
+ ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+ %t = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %a, i32 %b)
+ %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> poison
+
+
.. _int_vp_fma:
'``llvm.vp.fma.*``' Intrinsics
diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
--- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def
+++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
@@ -45,6 +45,7 @@
FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass, (true))
FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass, ())
FUNCTION_PASS("expand-large-fp-convert", ExpandLargeFpConvertPass, ())
+FUNCTION_PASS("expand-powi", ExpandPowiPass, ())
FUNCTION_PASS("expand-reductions", ExpandReductionsPass, ())
FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ())
FUNCTION_PASS("lowerinvoke", LowerInvokePass, ())
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -518,6 +518,9 @@
// Expands large div/rem instructions.
FunctionPass *createExpandLargeFpConvertPass();
+ // Expands llvm.vp.powi and scalable-vector llvm.powi intrinsics.
+ FunctionPass *createExpandPowiPass();
+
// This pass expands memcmp() to load/stores.
FunctionPass *createExpandMemCmpPass();
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1682,6 +1682,11 @@
[ LLVMMatchType<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_i32_ty]>;
+ def int_vp_powi : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>,
+ llvm_anyint_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
// Casts
def int_vp_trunc : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -364,6 +364,9 @@
BEGIN_REGISTER_VP(vp_nearbyint, 1, 2, VP_FNEARBYINT, -1)
END_REGISTER_VP(vp_nearbyint, VP_FNEARBYINT)
+// llvm.vp.powi(x,y,mask,vlen)
+BEGIN_REGISTER_VP_INTRINSIC(vp_powi, 2, 3)
+END_REGISTER_VP_INTRINSIC(vp_powi)
///// } Floating-Point Arithmetic
///// Type Casts {
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -126,6 +126,7 @@
void initializeExpandLargeDivRemLegacyPassPass(PassRegistry&);
void initializeExpandMemCmpPassPass(PassRegistry&);
void initializeExpandPostRAPass(PassRegistry&);
+void initializeExpandPowiLegacyPassPass(PassRegistry &);
void initializeExpandReductionsPass(PassRegistry&);
void initializeExpandVectorPredicationPass(PassRegistry &);
void initializeMakeGuardsExplicitLegacyPassPass(PassRegistry&);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -59,6 +59,7 @@
ExpandLargeFpConvert.cpp
ExpandMemCmp.cpp
ExpandPostRAPseudos.cpp
+ ExpandPowi.cpp
ExpandReductions.cpp
ExpandVectorPredication.cpp
FaultMaps.cpp
diff --git a/llvm/lib/CodeGen/ExpandPowi.cpp b/llvm/lib/CodeGen/ExpandPowi.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/CodeGen/ExpandPowi.cpp
@@ -0,0 +1,172 @@
+//===--- ExpandPowi.cpp - Expand Powi intrinsics ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for powi/vp.powi. The expansion is based on
+// __powidf2 from compiler-rt/lib/builtins/powidf2.c.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "expand-powi"
+
+using namespace llvm;
+
+// Emit, right before I, an i32 computing vscale * (known minimum element
+// count of I's scalable vector type), used as the EVL covering every lane.
+static Value *getMaxEVL(Instruction *I) {
+  assert(isa<ScalableVectorType>(I->getType()) &&
+         "Only serve scalable llvm.powi now.");
+  ElementCount EC = cast<VectorType>(I->getType())->getElementCount();
+  IRBuilder<> Builder(I);
+  Type *Int32Ty = Type::getInt32Ty(I->getContext());
+  Value *VScale = Builder.CreateIntrinsic(Int32Ty, Intrinsic::vscale, {});
+  Value *FactorConst = Builder.getInt32(EC.getKnownMinValue());
+  return Builder.CreateMul(VScale, FactorConst, "scalable_size",
+                           /*NUW*/ true, /*NSW*/ false);
+}
+
+// Expand II (llvm.powi or llvm.vp.powi) into a square-and-multiply loop that
+// mirrors the C code of compiler-rt's __powidf2:
+//   const int recip = b < 0;
+//   double r = 1;
+//   while (1) {
+//     if (b & 1)
+//       r *= a;
+//     b /= 2;
+//     if (b == 0)
+//       break;
+//     a *= a;
+//   }
+//   return recip ? 1 / r : r;
+// llvm.powi is expanded with vp intrinsics too: it is treated as llvm.vp.powi
+// with an all-true mask and the maximum vector length as EVL.
+static void expandPowi(IntrinsicInst *II) {
+  LLVMContext &C = II->getContext();
+  Value *OrigBase = II->getOperand(0);
+  Value *OrigExp = II->getOperand(1);
+  VectorType *BaseTy = cast<VectorType>(OrigBase->getType());
+  Type *ExpTy = OrigExp->getType();
+  Type *CondTy = BaseTy->getWithNewType(Type::getInt1Ty(C));
+  Value *True = ConstantInt::get(CondTy, 1);
+  Value *Mask, *EVL;
+  if (II->getIntrinsicID() == Intrinsic::vp_powi) {
+    Mask = II->getOperand(2);
+    EVL = II->getOperand(3);
+  } else {
+    assert(II->getIntrinsicID() == Intrinsic::powi);
+    Mask = True;
+    EVL = getMaxEVL(II);
+  }
+
+  BasicBlock *PreLoopBB = II->getParent();
+  BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock(II, "powi-post-loop");
+  BasicBlock *LoopBody =
+      BasicBlock::Create(PreLoopBB->getContext(), "powi-expansion-loop",
+                         PreLoopBB->getParent(), PostLoopBB);
+  // Make PreLoopBB branch to the loop; drop the branch added by the split.
+  IRBuilder<> Builder(PreLoopBB->getTerminator());
+  Builder.CreateBr(LoopBody);
+  PreLoopBB->getTerminator()->eraseFromParent();
+
+  Builder.SetInsertPoint(LoopBody);
+  // Create phi of base.
+  PHINode *Base = Builder.CreatePHI(BaseTy, 2, "base");
+  Base->addIncoming(OrigBase, PreLoopBB);
+  // Create phi of exponent.
+  PHINode *Exp = Builder.CreatePHI(ExpTy, 2, "exp");
+  Exp->addIncoming(OrigExp, PreLoopBB);
+  // Create phi of res.
+  PHINode *Res = Builder.CreatePHI(BaseTy, 2, "res");
+  Res->addIncoming(ConstantFP::get(BaseTy, 1.), PreLoopBB);
+  // Res *= Base if Exp is odd.
+  Value *Tmp = Builder.CreateIntrinsic(BaseTy, Intrinsic::vp_fmul,
+                                       {Res, Base, True, EVL});
+  Value *And1 = Builder.CreateAnd(Exp, ConstantInt::get(ExpTy, 1));
+  Value *IsOdd = Builder.CreateICmpNE(And1, ConstantInt::get(ExpTy, 0));
+  Value *IsOddVec = Builder.CreateVectorSplat(BaseTy->getElementCount(), IsOdd);
+  Value *NewRes = Builder.CreateIntrinsic(BaseTy, Intrinsic::vp_select,
+                                          {IsOddVec, Tmp, Res, EVL});
+  Res->addIncoming(NewRes, LoopBody);
+  // Exp /= 2, signed like the C code, so negative exponents are handled too.
+  Value *NewExp = Builder.CreateSDiv(Exp, ConstantInt::get(ExpTy, 2));
+  Exp->addIncoming(NewExp, LoopBody);
+  // Update Base.
+  Value *NewBase = Builder.CreateIntrinsic(BaseTy, Intrinsic::vp_fmul,
+                                           {Base, Base, True, EVL});
+  Base->addIncoming(NewBase, LoopBody);
+  // Leave the loop once NewExp is zero.
+  Builder.CreateCondBr(Builder.CreateICmpEQ(NewExp, ConstantInt::get(ExpTy, 0)),
+                       PostLoopBB, LoopBody);
+
+  Builder.SetInsertPoint(&PostLoopBB->front());
+  // Use reciprocal if power is negative.
+  Value *Recip =
+      Builder.CreateIntrinsic(BaseTy, Intrinsic::vp_fdiv,
+                              {ConstantFP::get(BaseTy, 1.), NewRes, Mask, EVL});
+  Value *IsNegative =
+      Builder.CreateICmpSLT(OrigExp, ConstantInt::get(ExpTy, 0));
+  Value *IsNegativeVec =
+      Builder.CreateVectorSplat(BaseTy->getElementCount(), IsNegative);
+  Value *Powi = Builder.CreateIntrinsic(BaseTy, Intrinsic::vp_select,
+                                        {IsNegativeVec, Recip, NewRes, EVL});
+  II->replaceAllUsesWith(Powi);
+  II->eraseFromParent();
+}
+
+// Expand each llvm.vp.powi and scalable-vector llvm.powi in F; true on change.
+static bool runImpl(Function &F) {
+  SmallVector<IntrinsicInst *> Replace;
+  for (auto &I : instructions(F)) {
+    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+      // TODO: Add cost model to select small fixed vectors llvm.powi.
+      if (II->getIntrinsicID() == Intrinsic::vp_powi ||
+          (II->getIntrinsicID() == Intrinsic::powi &&
+           isa<ScalableVectorType>(II->getType())))
+        Replace.push_back(II);
+    }
+  }
+
+  if (Replace.empty())
+    return false;
+
+  for (IntrinsicInst *II : Replace)
+    expandPowi(II);
+  return true;
+}
+
+namespace {
+class ExpandPowiLegacyPass : public FunctionPass { // Legacy PM entry point.
+public:
+ static char ID; // Handed to the FunctionPass base constructor below.
+
+ ExpandPowiLegacyPass() : FunctionPass(ID) {
+ initializeExpandPowiLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override { return runImpl(F); } // Forwards.
+};
+} // namespace
+
+// Legacy pass manager registration; no dependent passes are declared.
+char ExpandPowiLegacyPass::ID = 0;
+
+INITIALIZE_PASS(ExpandPowiLegacyPass, "expand-powi", "Expand powi functions",
+                false, false)
+
+FunctionPass *llvm::createExpandPowiPass() { // Declared in CodeGen/Passes.h.
+ return new ExpandPowiLegacyPass();
+}
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1088,6 +1088,7 @@
PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
addPass(createExpandLargeDivRemPass());
addPass(createExpandLargeFpConvertPass());
+ addPass(createExpandPowiPass());
addIRPasses();
addCodeGenPrepare();
addPassesToHandleExceptions();
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -631,6 +631,10 @@
VPFunc = Intrinsic::getDeclaration(
M, VPID, {Params[0]->getType(), Params[1]->getType()});
break;
+ case Intrinsic::vp_powi:
+ VPFunc = Intrinsic::getDeclaration(
+ M, VPID, {Params[0]->getType(), Params[1]->getType()});
+ break;
}
assert(VPFunc && "Could not declare VP intrinsic");
return VPFunc;
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -17,6 +17,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -21,6 +21,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: SVE intrinsics optimizations
; CHECK-NEXT: FunctionPass Manager
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -28,6 +28,7 @@
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Expand large div/rem
; GCN-O0-NEXT: Expand large fp convert
+; GCN-O0-NEXT: Expand powi functions
; GCN-O0-NEXT: AMDGPU Printf lowering
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Dominator Tree Construction
@@ -178,6 +179,7 @@
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Expand large div/rem
; GCN-O1-NEXT: Expand large fp convert
+; GCN-O1-NEXT: Expand powi functions
; GCN-O1-NEXT: AMDGPU Printf lowering
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Dominator Tree Construction
@@ -453,6 +455,7 @@
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Expand large div/rem
; GCN-O1-OPTS-NEXT: Expand large fp convert
+; GCN-O1-OPTS-NEXT: Expand powi functions
; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
@@ -760,6 +763,7 @@
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Expand large div/rem
; GCN-O2-NEXT: Expand large fp convert
+; GCN-O2-NEXT: Expand powi functions
; GCN-O2-NEXT: AMDGPU Printf lowering
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Dominator Tree Construction
@@ -1070,6 +1074,7 @@
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Expand large div/rem
; GCN-O3-NEXT: Expand large fp convert
+; GCN-O3-NEXT: Expand powi functions
; GCN-O3-NEXT: AMDGPU Printf lowering
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -7,6 +7,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Dominator Tree Construction
diff --git a/llvm/test/CodeGen/Generic/expand-powi.ll b/llvm/test/CodeGen/Generic/expand-powi.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/expand-powi.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -expand-powi -S < %s | FileCheck %s
+declare @llvm.vp.powi.nxv1f32.i32(, i32, , i32)
+define @foo( %a, i32 %b, %m, i32 %evl) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[POWI_EXPANSION_LOOP:%.*]]
+; CHECK: powi-expansion-loop:
+; CHECK-NEXT: [[BASE:%.*]] = phi [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[POWI_EXPANSION_LOOP]] ]
+; CHECK-NEXT: [[EXP:%.*]] = phi i32 [ [[B:%.*]], [[ENTRY]] ], [ [[TMP4:%.*]], [[POWI_EXPANSION_LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi [ shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer), [[ENTRY]] ], [ [[TMP3:%.*]], [[POWI_EXPANSION_LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.vp.fmul.nxv1f32( [[RES]], [[BASE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[EVL:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[EXP]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i1 [[TMP2]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
+; CHECK-NEXT: [[TMP3]] = call @llvm.vp.select.nxv1f32( [[DOTSPLAT]], [[TMP0]], [[RES]], i32 [[EVL]])
+; CHECK-NEXT: [[TMP4]] = lshr i32 [[EXP]], 1
+; CHECK-NEXT: [[TMP5]] = call @llvm.vp.fmul.nxv1f32( [[BASE]], [[BASE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[EVL]])
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], 1
+; CHECK-NEXT: br i1 [[TMP6]], label [[POWI_POST_LOOP:%.*]], label [[POWI_EXPANSION_LOOP]]
+; CHECK: powi-post-loop:
+; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vp.fdiv.nxv1f32( shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer), [[TMP3]], [[M:%.*]], i32 [[EVL]])
+; CHECK-NEXT: [[TMP8:%.*]] = icmp slt i32 [[B]], 0
+; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i1 [[TMP8]], i64 0
+; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector [[DOTSPLATINSERT1]], poison, zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vp.select.nxv1f32( [[DOTSPLAT2]], [[TMP7]], [[TMP3]], i32 [[EVL]])
+; CHECK-NEXT: ret [[TMP9]]
+;
+entry:
+ %0 = call @llvm.vp.powi.nxv1f32.i32( %a, i32 %b, %m, i32 %evl)
+ ret %0
+}
+
+declare @llvm.powi.nxv1f32.i32(, i32)
+define @foo2( %a, i32 %b) {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[TMP0]], 1
+; CHECK-NEXT: br label [[POWI_EXPANSION_LOOP:%.*]]
+; CHECK: powi-expansion-loop:
+; CHECK-NEXT: [[BASE:%.*]] = phi [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[POWI_EXPANSION_LOOP]] ]
+; CHECK-NEXT: [[EXP:%.*]] = phi i32 [ [[B:%.*]], [[ENTRY]] ], [ [[TMP5:%.*]], [[POWI_EXPANSION_LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi [ shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer), [[ENTRY]] ], [ [[TMP4:%.*]], [[POWI_EXPANSION_LOOP]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vp.fmul.nxv1f32( [[RES]], [[BASE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[SCALABLE_SIZE]])
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EXP]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
+; CHECK-NEXT: [[TMP4]] = call @llvm.vp.select.nxv1f32( [[DOTSPLAT]], [[TMP1]], [[RES]], i32 [[SCALABLE_SIZE]])
+; CHECK-NEXT: [[TMP5]] = lshr i32 [[EXP]], 1
+; CHECK-NEXT: [[TMP6]] = call @llvm.vp.fmul.nxv1f32( [[BASE]], [[BASE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[SCALABLE_SIZE]])
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP5]], 1
+; CHECK-NEXT: br i1 [[TMP7]], label [[POWI_POST_LOOP:%.*]], label [[POWI_EXPANSION_LOOP]]
+; CHECK: powi-post-loop:
+; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vp.fdiv.nxv1f32( shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer), [[TMP4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[SCALABLE_SIZE]])
+; CHECK-NEXT: [[TMP9:%.*]] = icmp slt i32 [[B]], 0
+; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i1 [[TMP9]], i64 0
+; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector [[DOTSPLATINSERT1]], poison, zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vp.select.nxv1f32( [[DOTSPLAT2]], [[TMP8]], [[TMP4]], i32 [[SCALABLE_SIZE]])
+; CHECK-NEXT: ret [[TMP10]]
+;
+entry:
+ %0 = call @llvm.powi.nxv1f32.i32( %a, i32 %b)
+ ret %0
+}
diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
--- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
@@ -21,6 +21,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -33,6 +33,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Dominator Tree Construction
diff --git a/llvm/test/CodeGen/M68k/pipeline.ll b/llvm/test/CodeGen/M68k/pipeline.ll
--- a/llvm/test/CodeGen/M68k/pipeline.ll
+++ b/llvm/test/CodeGen/M68k/pipeline.ll
@@ -4,6 +4,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Dominator Tree Construction
diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
--- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
@@ -18,6 +18,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: PPC Lower MASS Entries
; CHECK-NEXT: FunctionPass Manager
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -21,6 +21,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Convert i1 constants to i32/i64 if they are returned
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: PPC Lower MASS Entries
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -21,6 +21,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -25,6 +25,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-powi.ll b/llvm/test/CodeGen/RISCV/rvv/expand-powi.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/expand-powi.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64
+
+declare @llvm.vp.powi.nxv1f32.i32(, i32, , i32)
+define @foo( %a, i32 %b, %m, i32 %evl) {
+; RV32-LABEL: foo:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vmv1r.v v9, v0
+; RV32-NEXT: lui a2, 260096
+; RV32-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v10, a2
+; RV32-NEXT: li a2, 1
+; RV32-NEXT: mv a3, a0
+; RV32-NEXT: .LBB0_1: # %powi-expansion-loop
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: andi a4, a3, 1
+; RV32-NEXT: vsetvli a5, zero, e8, mf8, ta, ma
+; RV32-NEXT: vmv.v.x v11, a4
+; RV32-NEXT: vmsne.vi v0, v11, 0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
+; RV32-NEXT: vfmul.vv v10, v10, v8, v0.t
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: vfmul.vv v8, v8, v8
+; RV32-NEXT: bne a3, a2, .LBB0_1
+; RV32-NEXT: # %bb.2: # %powi-post-loop
+; RV32-NEXT: lui a2, 260096
+; RV32-NEXT: fmv.w.x ft0, a2
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v0, v9
+; RV32-NEXT: vfrdiv.vf v8, v10, ft0, v0.t
+; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: vsetvli a2, zero, e8, mf8, ta, ma
+; RV32-NEXT: vmv.v.x v9, a0
+; RV32-NEXT: vmsne.vi v0, v9, 0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v10, v8, v0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: foo:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vmv1r.v v9, v0
+; RV64-NEXT: lui a2, 260096
+; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a2
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: li a2, 1
+; RV64-NEXT: mv a3, a0
+; RV64-NEXT: .LBB0_1: # %powi-expansion-loop
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: andi a4, a3, 1
+; RV64-NEXT: vsetvli a5, zero, e8, mf8, ta, ma
+; RV64-NEXT: vmv.v.x v11, a4
+; RV64-NEXT: vmsne.vi v0, v11, 0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
+; RV64-NEXT: vfmul.vv v10, v10, v8, v0.t
+; RV64-NEXT: srliw a3, a3, 1
+; RV64-NEXT: vfmul.vv v8, v8, v8
+; RV64-NEXT: bne a3, a2, .LBB0_1
+; RV64-NEXT: # %bb.2: # %powi-post-loop
+; RV64-NEXT: lui a2, 260096
+; RV64-NEXT: fmv.w.x ft0, a2
+; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v0, v9
+; RV64-NEXT: vfrdiv.vf v8, v10, ft0, v0.t
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: slti a0, a0, 0
+; RV64-NEXT: vsetvli a2, zero, e8, mf8, ta, ma
+; RV64-NEXT: vmv.v.x v9, a0
+; RV64-NEXT: vmsne.vi v0, v9, 0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vmerge.vvm v8, v10, v8, v0
+; RV64-NEXT: ret
+entry:
+ %0 = call @llvm.vp.powi.nxv1f32.i32( %a, i32 %b, %m, i32 %evl)
+ ret %0
+}
+
+declare @llvm.powi.nxv1f32.i32(, i32)
+define @foo2( %a, i32 %b) {
+; RV32-LABEL: foo2:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vmv1r.v v9, v8
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: srli a1, a1, 3
+; RV32-NEXT: lui a2, 260096
+; RV32-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: li a2, 1
+; RV32-NEXT: mv a3, a0
+; RV32-NEXT: .LBB1_1: # %powi-expansion-loop
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: andi a4, a3, 1
+; RV32-NEXT: vsetvli a5, zero, e8, mf8, ta, ma
+; RV32-NEXT: vmv.v.x v10, a4
+; RV32-NEXT: vmsne.vi v0, v10, 0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
+; RV32-NEXT: vfmul.vv v8, v8, v9, v0.t
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: vfmul.vv v9, v9, v9
+; RV32-NEXT: bne a3, a2, .LBB1_1
+; RV32-NEXT: # %bb.2: # %powi-post-loop
+; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: vsetvli a2, zero, e8, mf8, ta, ma
+; RV32-NEXT: vmv.v.x v9, a0
+; RV32-NEXT: vmsne.vi v0, v9, 0
+; RV32-NEXT: lui a0, 260096
+; RV32-NEXT: fmv.w.x ft0, a0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
+; RV32-NEXT: vfrdiv.vf v8, v8, ft0, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: foo2:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vmv1r.v v9, v8
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: srli a1, a1, 3
+; RV64-NEXT: lui a2, 260096
+; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a2
+; RV64-NEXT: li a2, 1
+; RV64-NEXT: mv a3, a0
+; RV64-NEXT: .LBB1_1: # %powi-expansion-loop
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: andi a4, a3, 1
+; RV64-NEXT: vsetvli a5, zero, e8, mf8, ta, ma
+; RV64-NEXT: vmv.v.x v10, a4
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
+; RV64-NEXT: vfmul.vv v8, v8, v9, v0.t
+; RV64-NEXT: srliw a3, a3, 1
+; RV64-NEXT: vfmul.vv v9, v9, v9
+; RV64-NEXT: bne a3, a2, .LBB1_1
+; RV64-NEXT: # %bb.2: # %powi-post-loop
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: slti a0, a0, 0
+; RV64-NEXT: vsetvli a2, zero, e8, mf8, ta, ma
+; RV64-NEXT: vmv.v.x v9, a0
+; RV64-NEXT: vmsne.vi v0, v9, 0
+; RV64-NEXT: lui a0, 260096
+; RV64-NEXT: fmv.w.x ft0, a0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
+; RV64-NEXT: vfrdiv.vf v8, v8, ft0, v0.t
+; RV64-NEXT: ret
+entry:
+ %0 = call @llvm.powi.nxv1f32.i32( %a, i32 %b)
+ ret %0
+}
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -19,6 +19,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX intrinsics
; CHECK-NEXT: Lower AMX type for load/store
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -29,6 +29,7 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand large fp convert
+; CHECK-NEXT: Expand powi functions
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX intrinsics
; CHECK-NEXT: Lower AMX type for load/store
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -363,6 +363,7 @@
initializeConstantHoistingLegacyPassPass(*Registry);
initializeScalarOpts(*Registry);
initializeVectorization(*Registry);
+ initializeExpandPowiLegacyPassPass(*Registry);
initializeScalarizeMaskedMemIntrinLegacyPassPass(*Registry);
initializeExpandReductionsPass(*Registry);
initializeExpandVectorPredicationPass(*Registry);
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -394,6 +394,7 @@
"fix-irreducible",
"expand-large-fp-convert",
"callbrprepare",
+ "expand-powi",
};
for (const auto &P : PassNamePrefix)
if (Pass.startswith(P))
@@ -443,6 +444,7 @@
initializeExpandLargeDivRemLegacyPassPass(Registry);
initializeExpandLargeFpConvertLegacyPassPass(Registry);
initializeExpandMemCmpPassPass(Registry);
+ initializeExpandPowiLegacyPassPass(Registry);
initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
initializeSelectOptimizePass(Registry);
initializeCallBrPreparePass(Registry);
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -77,6 +77,8 @@
"i32)";
Str << " declare <8 x float> @llvm.vp.sqrt.v8f32(<8 x float>, <8 x i1>, "
"i32)";
+ Str << " declare <8 x float> @llvm.vp.powi.v8f32.i32(<8 x float>, i32, "
+ "<8 x i1>, i32)";
Str << " declare <8 x float> @llvm.vp.fma.v8f32(<8 x float>, <8 x float>, "
"<8 x float>, <8 x i1>, i32) ";
Str << " declare <8 x float> @llvm.vp.fmuladd.v8f32(<8 x float>, "