diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
--- a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
@@ -54,6 +54,7 @@
   static StringRef getCPUSuffix(const PPCSubtarget *Subtarget);
   static std::string createMASSVFuncName(Function &Func,
                                          const PPCSubtarget *Subtarget);
+  bool handlePowSpecialCases(CallInst *CI, Function &Func, Module &M);
   bool lowerMASSVCall(CallInst *CI, Function &Func, Module &M,
                       const PPCSubtarget *Subtarget);
 };
@@ -96,6 +97,34 @@
   return MASSVEntryName;
 }
 
+/// Turns a MASSV pow call with a constant splat exponent of 0.25 or 0.75 into
+/// llvm.pow if it has ninf and afn (and nsz for 0.25). True iff CI was changed.
+bool PPCLowerMASSVEntries::handlePowSpecialCases(CallInst *CI, Function &Func,
+                                                 Module &M) {
+  if (Func.getName() != "__powf4_massv" && Func.getName() != "__powd2_massv")
+    return false;
+  // A non-splat constant has a null splat value, hence dyn_cast_or_null.
+  if (Constant *Exp = dyn_cast<Constant>(CI->getArgOperand(1)))
+    if (ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(Exp->getSplatValue())) {
+      // If the argument is 0.75 or 0.25 it is cheaper to turn it into pow
+      // intrinsic so that it could be optimized as sequence of sqrt's.
+      if (!CI->hasNoInfs() || !CI->hasApproxFunc())
+        return false;
+
+      if (!CFP->isExactlyValue(0.75) && !CFP->isExactlyValue(0.25))
+        return false;
+      // The 0.25 rewrite additionally requires the nsz flag.
+      if (CFP->isExactlyValue(0.25) && !CI->hasNoSignedZeros())
+        return false;
+      // Retarget the call in place; its arguments and flags are left as is.
+      CI->setCalledFunction(
+          Intrinsic::getDeclaration(&M, Intrinsic::pow, CI->getType()));
+      return true;
+    }
+
+  return false;
+}
+
 /// Lowers generic MASSV entries to PowerPC subtarget-specific MASSV entries.
 /// e.g.: __sind2_massv --> __sind2_P9 for a Power9 subtarget.
 /// Both function prototypes and their callsites are updated during lowering.
@@ -105,6 +134,10 @@ if (CI->use_empty()) return false; + // pow/powf(x, 0.25 or 0.75) rewritten to llvm.pow needs no MASSV lowering. + if (handlePowSpecialCases(CI, Func, M)) + return true; + std::string MASSVEntryName = createMASSVFuncName(Func, Subtarget); FunctionCallee FCache = M.getOrInsertFunction( MASSVEntryName, Func.getFunctionType(), Func.getAttributes()); diff --git a/llvm/test/CodeGen/PowerPC/pow_massv_075_025exp.ll b/llvm/test/CodeGen/PowerPC/pow_massv_075_025exp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pow_massv_075_025exp.ll @@ -0,0 +1,166 @@ +; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 | FileCheck -check-prefixes=CHECK-PWR9 %s +; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 | FileCheck -check-prefixes=CHECK-PWR8 %s + +; Exponent is a variable, so the call must still be lowered to the subtarget MASSV entry. NOTE(review) - the label check below names @vspow_var while the function is @my_vpow_var, and the run lines enable only the CHECK-PWR9 or CHECK-PWR8 prefix, so the un-prefixed CHECK directives look inactive; confirm. +define void @my_vpow_var(double* nocapture %z, double* nocapture readonly %y, double* nocapture readonly %x) { +; CHECK-LABEL: @vspow_var +; CHECK-PWR9: bl __powd2_P9 +; CHECK-PWR8: bl __powd2_P8 +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr double, double* %z, i64 %index + %next.gep31 = getelementptr double, double* %y, i64 %index + %next.gep32 = getelementptr double, double* %x, i64 %index + %0 = bitcast double* %next.gep32 to <2 x double>* + %wide.load = load <2 x double>, <2 x double>* %0, align 8 + %1 = bitcast double* %next.gep31 to <2 x double>* + %wide.load33 = load <2 x double>, <2 x double>* %1, align 8 + %2 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> %wide.load33) + %3 = bitcast double* %next.gep to <2 x double>* + store <2 x double> %2, <2 x double>* %3, align 8 + %index.next = add i64 %index, 2 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.end, label %vector.body + +for.end: + ret void +} + +; Exponent is a constant other than 0.75 and 0.25, so the MASSV call must be kept 
+define void @my_vpow_const(double* nocapture %y, double* nocapture readonly %x) { +; CHECK-LABEL: @vspow_const +; CHECK-PWR9: bl __powd2_P9 +; CHECK-PWR8: bl __powd2_P8 +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr double, double* %y, i64 %index + %next.gep19 = getelementptr double, double* %x, i64 %index + %0 = bitcast double* %next.gep19 to <2 x double>* + %wide.load = load <2 x double>, <2 x double>* %0, align 8 + %1 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> ) + %2 = bitcast double* %next.gep to <2 x double>* + store <2 x double> %1, <2 x double>* %2, align 8 + %index.next = add i64 %index, 2 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: + ret void +} + +; Exponent is 0.75 and the call has ninf and afn, so no MASSV call is expected. NOTE(review) - the constant exponent operand of the call is missing in this listing, and xvrsqrtesp looks like the single-precision mnemonic although the vectors are <2 x double>; confirm. +define void @my_vpow_075(double* nocapture %y, double* nocapture readonly %x) { +; CHECK-LABEL: @vspow_075 +; CHECK-NOT: bl __powd2_P{{[8,9]}} +; CHECK: xvrsqrtesp +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr double, double* %y, i64 %index + %next.gep19 = getelementptr double, double* %x, i64 %index + %0 = bitcast double* %next.gep19 to <2 x double>* + %wide.load = load <2 x double>, <2 x double>* %0, align 8 + %1 = call ninf afn <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> ) + %2 = bitcast double* %next.gep to <2 x double>* + store <2 x double> %1, <2 x double>* %2, align 8 + %index.next = add i64 %index, 2 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: + ret void +} + +; Exponent is 0.25 and the call has ninf, afn and nsz, so no MASSV call is expected +define void @my_vpow_025(double* nocapture %y, double* nocapture readonly %x) { +; CHECK-LABEL: @vspow_025 +; CHECK-NOT: bl __powd2_P{{[8,9]}} +; CHECK: xvrsqrtesp +; CHECK: blr +entry: + br label %vector.body + +vector.body: + 
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr double, double* %y, i64 %index + %next.gep19 = getelementptr double, double* %x, i64 %index + %0 = bitcast double* %next.gep19 to <2 x double>* + %wide.load = load <2 x double>, <2 x double>* %0, align 8 + %1 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> ) + %2 = bitcast double* %next.gep to <2 x double>* + store <2 x double> %1, <2 x double>* %2, align 8 + %index.next = add i64 %index, 2 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: + ret void +} + +; Exponent is 0.75 but the call has no fast-math flags, so the MASSV call must be kept +define void @my_vpow_075_nofast(double* nocapture %y, double* nocapture readonly %x) { +; CHECK-LABEL: @vspow_075_nofast +; CHECK-PWR9: bl __powd2_P9 +; CHECK-PWR8: bl __powd2_P8 +; CHECK-NOT: xvrsqrtesp +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr double, double* %y, i64 %index + %next.gep19 = getelementptr double, double* %x, i64 %index + %0 = bitcast double* %next.gep19 to <2 x double>* + %wide.load = load <2 x double>, <2 x double>* %0, align 8 + %1 = call <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> ) + %2 = bitcast double* %next.gep to <2 x double>* + store <2 x double> %1, <2 x double>* %2, align 8 + %index.next = add i64 %index, 2 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: + ret void +} + +; Exponent is 0.25 but the call has no fast-math flags, so the MASSV call must be kept +define void @my_vpow_025_nofast(double* nocapture %y, double* nocapture readonly %x) { +; CHECK-LABEL: @vspow_025_nofast +; CHECK-PWR9: bl __powd2_P9 +; CHECK-PWR8: bl __powd2_P8 +; CHECK-NOT: xvrsqrtesp +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr double, double* 
%y, i64 %index + %next.gep19 = getelementptr double, double* %x, i64 %index + %0 = bitcast double* %next.gep19 to <2 x double>* + %wide.load = load <2 x double>, <2 x double>* %0, align 8 + %1 = call <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> ) + %2 = bitcast double* %next.gep to <2 x double>* + store <2 x double> %1, <2 x double>* %2, align 8 + %index.next = add i64 %index, 2 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: + ret void +} + +; Function Attrs, as listed, are nounwind readnone speculatable willreturn. NOTE(review) - no definition of attribute group #1 is visible in this file; confirm it is intended. +declare <2 x double> @__powd2_massv(<2 x double>, <2 x double>) #1 diff --git a/llvm/test/CodeGen/PowerPC/powf_massv_075_025exp.ll b/llvm/test/CodeGen/PowerPC/powf_massv_075_025exp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/powf_massv_075_025exp.ll @@ -0,0 +1,166 @@ +; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 | FileCheck -check-prefixes=CHECK-PWR9 %s +; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 | FileCheck -check-prefixes=CHECK-PWR8 %s + +; Exponent is a variable, so the call must still be lowered to the subtarget MASSV entry +define void @vspow_var(float* nocapture %z, float* nocapture readonly %y, float* nocapture readonly %x) { +; CHECK-LABEL: @vspow_var +; CHECK-PWR9: bl __powf4_P9 +; CHECK-PWR8: bl __powf4_P8 +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr float, float* %z, i64 %index + %next.gep31 = getelementptr float, float* %y, i64 %index + %next.gep32 = getelementptr float, float* %x, i64 %index + %0 = bitcast float* %next.gep32 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %0, align 4 + %1 = bitcast float* %next.gep31 to <4 x float>* + %wide.load33 = load <4 x float>, <4 x float>* %1, align 4 + %2 = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> %wide.load33) + %3 = bitcast float* 
%next.gep to <4 x float>* + store <4 x float> %2, <4 x float>* %3, align 4 + %index.next = add i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.end, label %vector.body + +for.end: + ret void +} + +; Exponent is a constant other than 0.75 and 0.25, so the MASSV call must be kept +define void @vspow_const(float* nocapture %y, float* nocapture readonly %x) { +; CHECK-LABEL: @vspow_const +; CHECK-PWR9: bl __powf4_P9 +; CHECK-PWR8: bl __powf4_P8 +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr float, float* %y, i64 %index + %next.gep19 = getelementptr float, float* %x, i64 %index + %0 = bitcast float* %next.gep19 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %0, align 4 + %1 = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> ) + %2 = bitcast float* %next.gep to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + %index.next = add i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: + ret void +} + +; Exponent is 0.75 and the call has ninf and afn, so no MASSV call is expected +define void @vspow_075(float* nocapture %y, float* nocapture readonly %x) { +; CHECK-LABEL: @vspow_075 +; CHECK-NOT: bl __powf4_P{{[8,9]}} +; CHECK: xvrsqrtesp +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr float, float* %y, i64 %index + %next.gep19 = getelementptr float, float* %x, i64 %index + %0 = bitcast float* %next.gep19 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %0, align 4 + %1 = call ninf afn <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> ) + %2 = bitcast float* %next.gep to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + %index.next = add i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: + ret void +} + +; Exponent is 0.25 and the call has ninf, afn and nsz, so no MASSV call is expected 
+define void @vspow_025(float* nocapture %y, float* nocapture readonly %x) { +; CHECK-LABEL: @vspow_025 +; CHECK-NOT: bl __powf4_P{{[8,9]}} +; CHECK: xvrsqrtesp +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr float, float* %y, i64 %index + %next.gep19 = getelementptr float, float* %x, i64 %index + %0 = bitcast float* %next.gep19 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %0, align 4 + %1 = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> ) + %2 = bitcast float* %next.gep to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + %index.next = add i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: + ret void +} + +; Exponent is 0.75 but the call has no fast-math flags, so the MASSV call must be kept. NOTE(review) - the constant exponent operand of the call is missing in this listing; confirm against the original test. +define void @vspow_075_nofast(float* nocapture %y, float* nocapture readonly %x) { +; CHECK-LABEL: @vspow_075_nofast +; CHECK-PWR9: bl __powf4_P9 +; CHECK-PWR8: bl __powf4_P8 +; CHECK-NOT: xvrsqrtesp +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr float, float* %y, i64 %index + %next.gep19 = getelementptr float, float* %x, i64 %index + %0 = bitcast float* %next.gep19 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %0, align 4 + %1 = call <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> ) + %2 = bitcast float* %next.gep to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + %index.next = add i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: + ret void +} + +; Exponent is 0.25 but the call has no fast-math flags, so the MASSV call must be kept +define void @vspow_025_nofast(float* nocapture %y, float* nocapture readonly %x) { +; CHECK-LABEL: @vspow_025_nofast +; CHECK-PWR9: bl __powf4_P9 +; CHECK-PWR8: bl __powf4_P8 +; 
CHECK-NOT: xvrsqrtesp +; CHECK: blr +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %next.gep = getelementptr float, float* %y, i64 %index + %next.gep19 = getelementptr float, float* %x, i64 %index + %0 = bitcast float* %next.gep19 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %0, align 4 + %1 = call <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> ) + %2 = bitcast float* %next.gep to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + %index.next = add i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: + ret void +} + +; Function Attrs, as listed, are nounwind readnone speculatable willreturn +declare <4 x float> @__powf4_massv(<4 x float>, <4 x float>)