diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -588,8 +588,17 @@ return getFunctionalOpcodeForVP(getIntrinsicID()); } + // Equivalent non-predicated constrained ID + std::optional<unsigned> getConstrainedIntrinsicID() const { + return getConstrainedIntrinsicIDForVP(getIntrinsicID()); + } + // Equivalent non-predicated opcode static std::optional<unsigned> getFunctionalOpcodeForVP(Intrinsic::ID ID); + + // Equivalent non-predicated constrained ID + static std::optional<unsigned> + getConstrainedIntrinsicIDForVP(Intrinsic::ID ID); }; /// This represents vector predication reduction intrinsics. diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -171,6 +171,10 @@ Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder, VPIntrinsic &PI); + /// Lower this VP fp call to an unpredicated fp call. + Value *expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI, + unsigned UnpredicatedIntrinsicID); + /// Lower this VP reduction to a call to an unpredicated reduction intrinsic. 
Value *expandPredicationInReduction(IRBuilder<> &Builder, VPReductionIntrinsic &PI); @@ -271,6 +275,38 @@ return NewBinOp; } +Value *CachingVPExpander::expandPredicationToFPCall( + IRBuilder<> &Builder, VPIntrinsic &VPI, unsigned UnpredicatedIntrinsicID) { + assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && + "Implicitly dropping %evl in non-speculatable operator!"); + + switch (UnpredicatedIntrinsicID) { + case Intrinsic::fabs: + case Intrinsic::sqrt: { /* unary: only operand 0 is forwarded */ + Value *Op0 = VPI.getOperand(0); + Function *Fn = Intrinsic::getDeclaration( + VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); + Value *NewOp = Builder.CreateCall(Fn, {Op0}, VPI.getName()); + replaceOperation(*NewOp, VPI); + return NewOp; + } + case Intrinsic::experimental_constrained_fma: + case Intrinsic::experimental_constrained_fmuladd: { /* operands 0-2 */ + Value *Op0 = VPI.getOperand(0); + Value *Op1 = VPI.getOperand(1); + Value *Op2 = VPI.getOperand(2); + Function *Fn = Intrinsic::getDeclaration( + VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); + Value *NewOp = + Builder.CreateConstrainedFPCall(Fn, {Op0, Op1, Op2}, VPI.getName()); + replaceOperation(*NewOp, VPI); + return NewOp; + } + } + + return nullptr; /* no expansion for this intrinsic ID */ +} + static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, Type *EltTy) { bool Negative = false; @@ -565,6 +601,10 @@ switch (VPI.getIntrinsicID()) { default: break; + case Intrinsic::vp_fabs: + return expandPredicationToFPCall(Builder, VPI, Intrinsic::fabs); + case Intrinsic::vp_sqrt: + return expandPredicationToFPCall(Builder, VPI, Intrinsic::sqrt); case Intrinsic::vp_load: case Intrinsic::vp_store: case Intrinsic::vp_gather: @@ -572,6 +612,10 @@ return expandPredicationInMemoryIntrinsic(Builder, VPI); } + if (auto CID = VPI.getConstrainedIntrinsicID()) + if (Value *Call = expandPredicationToFPCall(Builder, VPI, *CID)) + return Call; + return &VPI; } diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp --- 
a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -529,6 +529,20 @@ return std::nullopt; } +// Equivalent non-predicated constrained ID +std::optional<unsigned> +VPIntrinsic::getConstrainedIntrinsicIDForVP(Intrinsic::ID ID) { + switch (ID) { + default: + break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_CONSTRAINEDFP(HASRND, HASEXCEPT, CID) return Intrinsic::CID; +#define END_REGISTER_VP_INTRINSIC(VPID) break; +#include "llvm/IR/VPIntrinsics.def" + } + return std::nullopt; +} + Intrinsic::ID VPIntrinsic::getForOpcode(unsigned IROPC) { switch (IROPC) { default: diff --git a/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll @@ -156,21 +156,54 @@ } declare <4 x float> @llvm.vp.frem.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) -; TODO: @llvm.vp.fabs.v4f32 -;define void @vp_fabs_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { -; %res = call <4 x float> @llvm.vp.fabs.v4f32(<4 x float> %a0, <4 x i1> , i32 %vp) -; store <4 x float> %res, ptr %out -; ret void -;} -;declare <4 x float> @llvm.vp.fabs.v4f32(<4 x float>, <4 x i1>, i32) +define void @vp_fabs_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; SSE-LABEL: vp_fabs_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_fabs_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vp_fabs_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_fabs_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: retq + %res = call <4 x float> @llvm.vp.fabs.v4f32(<4 x float> %a0, <4 x i1> , i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fabs.v4f32(<4 x float>, <4 x i1>, i32) -; TODO: @llvm.vp.sqrt.v4f32 -;define void @vp_sqrt_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { -; %res = call <4 x float> @llvm.vp.sqrt.v4f32(<4 x float> %a0, <4 x i1> , i32 %vp) -; store <4 x float> %res, ptr %out -; ret void -;} -;declare <4 x float> @llvm.vp.sqrt.v4f32(<4 x float>, <4 x i1>, i32) +define void @vp_sqrt_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; SSE-LABEL: vp_sqrt_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: vp_sqrt_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %xmm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rdi) +; AVX-NEXT: retq + %res = call <4 x float> @llvm.vp.sqrt.v4f32(<4 x float> %a0, <4 x i1> , i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.sqrt.v4f32(<4 x float>, <4 x i1>, i32) ; TODO: @llvm.vp.fneg.v4f32 ;define void @vp_fneg_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { @@ -180,23 +213,171 @@ ;} ;declare <4 x float> @llvm.vp.fneg.v4f32(<4 x float>, <4 x i1>, i32) -; TODO: @llvm.vp.fma.v4f32 -;define void @vp_fma_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind { -; %res = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> , i32 4) -; store <4 x float> %res, ptr %out -; ret void -;} -;declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) +define void @vp_fma_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind { +; SSE-LABEL: vp_fma_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $64, %rsp +; SSE-NEXT: movq 
%rdi, %rbx +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: callq fmaf@PLT +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: callq fmaf@PLT +; SSE-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: callq fmaf@PLT +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: callq fmaf@PLT +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; SSE-NEXT: movaps %xmm1, (%rbx) +; SSE-NEXT: addq $64, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_fma_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $48, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-NEXT: callq fmaf@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,1,3,3] +; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = mem[1,1,3,3] +; AVX1-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-NEXT: callq fmaf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = mem[1,0] +; AVX1-NEXT: vmovapd %xmm1, %xmm2 +; AVX1-NEXT: callq fmaf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[3,3,3,3] +; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = mem[3,3,3,3] +; AVX1-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-NEXT: callq fmaf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vmovaps %xmm0, (%rbx) +; AVX1-NEXT: addq $48, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: vp_fma_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $48, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm1, %xmm2 +; AVX2-NEXT: callq fmaf@PLT +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovshdup 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,1,3,3] +; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = mem[1,1,3,3] +; AVX2-NEXT: vmovaps %xmm1, %xmm2 +; AVX2-NEXT: callq fmaf@PLT +; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = mem[1,0] +; AVX2-NEXT: vmovapd %xmm1, %xmm2 +; AVX2-NEXT: callq fmaf@PLT +; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = mem[3,3,3,3] +; AVX2-NEXT: vmovaps %xmm1, %xmm2 +; AVX2-NEXT: callq fmaf@PLT +; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: vmovaps %xmm0, (%rbx) +; AVX2-NEXT: addq $48, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_fma_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; AVX512-NEXT: vmovaps %xmm0, (%rdi) +; AVX512-NEXT: retq + %res = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> , i32 4) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) -; TODO: @llvm.vp.fmuladd.v4f32 -;define void @vp_fmuladd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind 
{ -; %res = call <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> , i32 4) -; store <4 x float> %res, ptr %out -; ret void -;} -;declare <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) +define void @vp_fmuladd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind { +; SSE-LABEL: vp_fmuladd_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_fmuladd_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vp_fmuladd_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_fmuladd_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; AVX512-NEXT: vmovaps %xmm0, (%rdi) +; AVX512-NEXT: retq + %res = call <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> , i32 4) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX1: {{.*}} -; AVX2: {{.*}} -; AVX512: {{.*}}