diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -25,6 +25,8 @@
 bool isVVPBinaryOp(unsigned Opcode);
 
+bool isVVPTernaryOp(unsigned Opcode);
+
 MVT splitVectorType(MVT VT);
 
 bool isPackedVectorType(EVT SomeVT);
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -138,6 +138,16 @@
   return false;
 }
 
+bool isVVPTernaryOp(unsigned VVPOpcode) {
+  switch (VVPOpcode) {
+#define ADD_TERNARY_VVP_OP(VVPNAME, ...)                                       \
+  case VEISD::VVPNAME:                                                         \
+    return true;
+#include "VVPNodes.def"
+  }
+  return false;
+}
+
 // Return the AVL operand position for this VVP or VEC Op.
 Optional<int> getAVLPos(unsigned Opc) {
   // This is only available for VP SDNodes
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -81,19 +81,30 @@
     return CDAG.getNode(VVPOpcode, LegalVecVT,
                         {Op->getOperand(0), Op->getOperand(1), Mask, AVL});
   }
-  if (VVPOpcode == VEISD::VVP_SELECT) {
+  switch (VVPOpcode) {
+  default:
+    llvm_unreachable("lowerToVVP called for unexpected SDNode.");
+  case VEISD::VVP_FFMA: {
+    // VE has a swizzled operand order in FMA (compared to LLVM IR and
+    // SDNodes).
+    auto X = Op->getOperand(2);
+    auto Y = Op->getOperand(0);
+    auto Z = Op->getOperand(1);
+    return CDAG.getNode(VVPOpcode, LegalVecVT, {X, Y, Z, Mask, AVL});
+  }
+  case VEISD::VVP_SELECT: {
     auto Mask = Op->getOperand(0);
     auto OnTrue = Op->getOperand(1);
     auto OnFalse = Op->getOperand(2);
     return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL});
   }
-  if (VVPOpcode == VEISD::VVP_SETCC) {
+  case VEISD::VVP_SETCC: {
     auto LHS = Op->getOperand(0);
     auto RHS = Op->getOperand(1);
     auto Pred = Op->getOperand(2);
     return CDAG.getNode(VVPOpcode, LegalVecVT, {LHS, RHS, Pred, Mask, AVL});
   }
-  llvm_unreachable("lowerToVVP called for unexpected SDNode.");
+  }
 }
 
 SDValue VETargetLowering::lowerVVP_LOAD_STORE(SDValue Op,
diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td
--- a/llvm/lib/Target/VE/VVPInstrInfo.td
+++ b/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -57,6 +57,17 @@
   IsVLVT<4>
 ]>;
 
+// TernaryFPOp(x,y,z,mask,vl)
+def SDTFPTernaryOpVVP : SDTypeProfile<1, 5, [
+  SDTCisSameAs<0, 1>,
+  SDTCisSameAs<0, 2>,
+  SDTCisSameAs<0, 3>,
+  SDTCisFP<0>,
+  SDTCisInt<4>,
+  SDTCisSameNumEltsAs<0, 4>,
+  IsVLVT<5>
+]>;
+
 // Select(OnTrue, OnFalse, SelMask, vl)
 def SDTSelectVVP : SDTypeProfile<1, 4, [ // vp_select, vp_merge
   SDTCisVec<0>,
@@ -86,6 +97,12 @@
   [(RootOp node:$lhs, node:$rhs, node:$mask, node:$vlen),
    (RootOp node:$rhs, node:$lhs, node:$mask, node:$vlen)]>;
 
+class vvp_fma_commutative<SDNode RootOp> :
+  PatFrags<
+    (ops node:$X, node:$Y, node:$Z, node:$mask, node:$vlen),
+    [(RootOp node:$X, node:$Y, node:$Z, node:$mask, node:$vlen),
+     (RootOp node:$X, node:$Z, node:$Y, node:$mask, node:$vlen)]>;
+
 // VVP node definitions.
 def vvp_add : SDNode<"VEISD::VVP_ADD", SDTIntBinOpVVP>;
 def c_vvp_add : vvp_commutative<vvp_add>;
@@ -118,6 +135,9 @@
 def c_vvp_fmul : vvp_commutative<vvp_fmul>;
 def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>;
 
+def vvp_ffma : SDNode<"VEISD::VVP_FFMA", SDTFPTernaryOpVVP>;
+def c_vvp_ffma : vvp_fma_commutative<vvp_ffma>;
+
 // } Binary Operators
 
 def vvp_load : SDNode<"VEISD::VVP_LOAD", SDTLoadVVP,
diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
--- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -341,6 +341,122 @@ defm : Binary_rv_vv;
 
+multiclass Ternary_vvv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  ///// Mask + vvp_select /////
+  // with mask
+  def : Pat<(vvp_select
+                (OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz,
+                    (MaskVT srcvalue), (i32 srcvalue)),
+                DataVT:$vfalse,
+                MaskVT:$mask,
+                i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vvvml_v")
+                $vx, $vy, $vz, $mask, $avl, $vfalse)>;
+
+  ///// w/o Mask /////
+  // w/o mask
+  def : Pat<(OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz,
+                (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vvvl")
+                $vx, $vy, $vz, $avl)>;
+
+  ///// Mask /////
+  // with mask
+  def : Pat<(OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz,
+                MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vvvml")
+                $vx, $vy, $vz, $mask, $avl)>;
+}
+
+multiclass Ternary_rvv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  // Masked with passthru, broadcast first.
+  def : Pat<(vvp_select
+                (OpNode
+                    (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz,
+                    (MaskVT srcvalue), (i32 srcvalue)),
+                DataVT:$vfalse,
+                MaskVT:$mask,
+                i32:$avl),
+            (!cast<Instruction>(OpBaseName#"rvvml_v")
+                $sx, $vy, $vz, $mask, $avl, $vfalse)>;
+
+  // Unmasked, broadcast first.
+  def : Pat<(OpNode
+                (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz,
+                (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(OpBaseName#"rvvl")
+                $sx, $vy, $vz, $avl)>;
+
+  // Masked, broadcast first.
+  def : Pat<(OpNode
+                (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz,
+                MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(OpBaseName#"rvvml")
+                $sx, $vy, $vz, $mask, $avl)>;
+}
+
+multiclass Ternary_vrv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  // Masked with passthru, broadcast second.
+  def : Pat<(vvp_select
+                (OpNode
+                    DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz,
+                    (MaskVT srcvalue), (i32 srcvalue)),
+                DataVT:$vfalse,
+                MaskVT:$mask,
+                i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vrvml_v")
+                $vx, $sy, $vz,
+                $mask, $avl, $vfalse)>;
+
+  // Unmasked, broadcast second.
+  def : Pat<(OpNode
+                DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz,
+                (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vrvl")
+                $vx, $sy, $vz, $avl)>;
+
+  // Masked, broadcast second.
+  def : Pat<(OpNode
+                DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz,
+                MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vrvml")
+                $vx, $sy, $vz, $mask, $avl)>;
+}
+
+multiclass Ternary_rvv_vrv_vvv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  defm : Ternary_rvv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+  defm : Ternary_vrv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+  defm : Ternary_vvv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+}
+
+// Expand both the 64-bit and the 32-bit (256-element) variants.
+multiclass Ternary_ShortLong<
+    SDPatternOperator OpNode,
+    ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName,
+    ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> {
+  defm : Ternary_rvv_vrv_vvv<OpNode, LongScalarVT, LongDataVT,
+                             v256i1, LongOpBaseName>;
+  defm : Ternary_rvv_vrv_vvv<OpNode, ShortScalarVT, ShortDataVT,
+                             v256i1, ShortOpBaseName>;
+}
+
+defm : Ternary_ShortLong<c_vvp_ffma,
+                         f64, v256f64, "VFMADD",
+                         f32, v256f32, "VFMADS">;
+defm : Ternary_rvv_vrv_vvv<c_vvp_ffma,
+                           i64, v512f32, v512i1, "PVFMAD">;
+
 multiclass Merge_mvv<
     SDPatternOperator OpNode,
     ValueType DataVT, ValueType MaskVT,
diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -33,6 +33,14 @@
 HANDLE_VP_TO_VVP(VPNAME, VVPNAME)
 #endif
 
+/// ADD_TERNARY_VVP_OP(VVPNAME,SDNAME)
+/// \p VVPNAME is a VVP ternary operator.
+/// \p SDNAME is the generic SD opcode corresponding to \p VVPNAME.
+#ifndef ADD_TERNARY_VVP_OP
+#define ADD_TERNARY_VVP_OP(VVPNAME,SDNAME) \
+  ADD_VVP_OP(VVPNAME,SDNAME)
+#endif
+
 #ifndef ADD_BINARY_VVP_OP_COMPACT
 #define ADD_BINARY_VVP_OP_COMPACT(NAME) \
   ADD_BINARY_VVP_OP(VVP_##NAME,VP_##NAME,NAME)
@@ -68,6 +76,8 @@
 ADD_BINARY_VVP_OP_COMPACT(FMUL) REGISTER_PACKED(VVP_FMUL)
 ADD_BINARY_VVP_OP_COMPACT(FDIV)
 
+ADD_TERNARY_VVP_OP(VVP_FFMA,FMA) HANDLE_VP_TO_VVP(VP_FMA, VVP_FFMA) REGISTER_PACKED(VVP_FFMA)
+
 ADD_VVP_OP(VVP_SETCC, SETCC)
 
 // Shuffles.
@@ -76,6 +86,7 @@
 HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT)
 
 #undef ADD_BINARY_VVP_OP
+#undef ADD_TERNARY_VVP_OP
 #undef ADD_BINARY_VVP_OP_COMPACT
 #undef ADD_VVP_OP
 #undef HANDLE_VP_TO_VVP
diff --git a/llvm/test/CodeGen/VE/Vector/vec_fma.ll b/llvm/test/CodeGen/VE/Vector/vec_fma.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_fma.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.fma.v256f32(<256 x float>, <256 x float>, <256 x float>)
+
+define fastcc <256 x float> @test_vec_fma_v256f32_vvv(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f32_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.s %v0, %v2, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vec_fma_v256f32_rvv(float %s0, <256 x float> %i1, <256 x float> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f32_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vec_fma_v256f32_vrv(<256 x float> %i0, float %s1, <256 x float> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f32_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vec_fma_v256f32_vvr(<256 x float> %i0, <256 x float> %i1, float %s2) {
+; CHECK-LABEL: test_vec_fma_v256f32_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %s0, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x float> undef, float %s2, i32 0
+  %i2 = shufflevector <256 x float> %zins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+declare <256 x double> @llvm.fma.v256f64(<256 x double>, <256 x double>, <256 x double>)
+
+define fastcc <256 x double> @test_vec_fma_v256f64_vvv(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f64_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.d %v0, %v2, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vec_fma_v256f64_rvv(double %s0, <256 x double> %i1, <256 x double> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f64_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vec_fma_v256f64_vrv(<256 x double> %i0, double %s1, <256 x double> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f64_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vec_fma_v256f64_vvr(<256 x double> %i0, <256 x double> %i1, double %s2) {
+; CHECK-LABEL: test_vec_fma_v256f64_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %s0, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x double> undef, double %s2, i32 0
+  %i2 = shufflevector <256 x double> %zins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_fma.ll b/llvm/test/CodeGen/VE/Vector/vp_fma.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_fma.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.fma.v256f32(<256 x float>, <256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvv(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.s %v0, %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_rvv(float %s0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vrv(<256 x float> %i0, float %s1, <256 x float> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvr(<256 x float> %i0, <256 x float> %i1, float %s2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %s0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x float> undef, float %s2, i32 0
+  %i2 = shufflevector <256 x float> %zins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+declare <256 x double> @llvm.vp.fma.v256f64(<256 x double>, <256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fma_v256f64_vvv(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.d %v0, %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fma_v256f64_rvv(double %s0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fma_v256f64_vrv(<256 x double> %i0, double %s1, <256 x double> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fma_v256f64_vvr(<256 x double> %i0, <256 x double> %i1, double %s2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %s0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x double> undef, double %s2, i32 0
+  %i2 = shufflevector <256 x double> %zins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_fma_merge.ll b/llvm/test/CodeGen/VE/Vector/vp_fma_merge.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_fma_merge.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.merge.v256f32(<256 x i1>, <256 x float>, <256 x float>, i32)
+declare <256 x float> @llvm.vp.fma.v256f32(<256 x float>, <256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvv_merge(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.s %v3, %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v3
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_rvv_merge(float %s0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_rvv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v2, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v2
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vrv_merge(<256 x float> %i0, float %s1, <256 x float> %i2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_vrv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v2, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v2
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvr_merge(<256 x float> %i0, <256 x float> %i1, float %s2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v2, %s0, %v0, %v1, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v2
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x float> undef, float %s2, i32 0
+  %i2 = shufflevector <256 x float> %zins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}