diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -25,6 +25,8 @@
 
 bool isVVPBinaryOp(unsigned Opcode);
 
+MVT splitVectorType(MVT VT);
+
 bool isPackedVectorType(EVT SomeVT);
 
 bool isMaskType(EVT SomeVT);
@@ -33,6 +35,10 @@
 
 bool isVVPOrVEC(unsigned);
 
+bool supportsPackedMode(unsigned Opcode, EVT IdiomVT);
+
+bool isPackingSupportOpcode(unsigned Opc);
+
 bool maySafelyIgnoreMask(SDValue Op);
 
 /// The VE backend uses a two-staged process to lower and legalize vector
@@ -71,6 +77,11 @@
 // The AVL operand of this node.
 SDValue getNodeAVL(SDValue);
 
+// Mask position of this node.
+Optional<int> getMaskPos(unsigned);
+
+SDValue getNodeMask(SDValue);
+
 // Return the AVL operand of this node. If it is a LEGALAVL node, unwrap it.
 // Return with the boolean whether unwrapping happened.
 std::pair<SDValue, bool> getAnnotatedNodeAVL(SDValue);
@@ -93,6 +104,13 @@
   Hi = 1 // Float (32, 0]
 };
 
+struct VETargetMasks {
+  SDValue Mask;
+  SDValue AVL;
+  VETargetMasks(SDValue Mask = SDValue(), SDValue AVL = SDValue())
+      : Mask(Mask), AVL(AVL) {}
+};
+
 class VECustomDAG {
   SelectionDAG &DAG;
   SDLoc DL;
@@ -135,8 +153,8 @@
   /// } getNode
 
   /// Packing {
-  SDValue getUnpack(EVT DestVT, SDValue Vec, PackElem Part, SDValue AVL);
-  SDValue getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, SDValue AVL);
+  SDValue getUnpack(EVT DestVT, SDValue Vec, PackElem Part, SDValue AVL) const;
+  SDValue getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, SDValue AVL) const;
   /// } Packing
 
   SDValue getConstant(uint64_t Val, EVT VT, bool IsTarget = false,
@@ -148,6 +166,8 @@
 
   // Wrap AVL in a LEGALAVL node (unless it is one already).
   SDValue annotateLegalAVL(SDValue AVL) const;
+  VETargetMasks getTargetSplitMask(SDValue RawMask, SDValue RawAVL,
+                                   PackElem Part) const;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -25,6 +25,12 @@
   return SomeVT.getVectorNumElements() > StandardVectorWidth;
 }
 
+MVT splitVectorType(MVT VT) {
+  if (!VT.isVector())
+    return VT;
+  return MVT::getVectorVT(VT.getVectorElementType(), StandardVectorWidth);
+}
+
 MVT getLegalVectorType(Packing P, MVT ElemVT) {
   return MVT::getVectorVT(ElemVT, P == Packing::Normal ? StandardVectorWidth
                                                        : PackedVectorWidth);
 }
@@ -83,6 +89,31 @@
   }
 }
 
+bool supportsPackedMode(unsigned Opcode, EVT IdiomVT) {
+  bool IsPackedOp = isPackedVectorType(IdiomVT);
+  bool IsMaskOp = isMaskType(IdiomVT);
+  switch (Opcode) {
+  default:
+    return false;
+
+  case VEISD::VEC_BROADCAST:
+    return true;
+#define REGISTER_PACKED(VVP_NAME) case VEISD::VVP_NAME:
+#include "VVPNodes.def"
+    return IsPackedOp && !IsMaskOp;
+  }
+}
+
+bool isPackingSupportOpcode(unsigned Opc) {
+  switch (Opc) {
+  case VEISD::VEC_PACK:
+  case VEISD::VEC_UNPACK_LO:
+  case VEISD::VEC_UNPACK_HI:
+    return true;
+  }
+  return false;
+}
+
 bool isVVPOrVEC(unsigned Opcode) {
   switch (Opcode) {
   case VEISD::VEC_BROADCAST:
@@ -125,6 +156,25 @@
   return None;
 }
 
+Optional<int> getMaskPos(unsigned Opc) {
+  // This is only available for VP SDNodes.
+  auto PosOpt = ISD::getVPMaskIdx(Opc);
+  if (PosOpt)
+    return *PosOpt;
+
+  // VVP Opcodes.
+  if (isVVPBinaryOp(Opc))
+    return 2;
+
+  // VM Opcodes.
+  switch (Opc) {
+  case VEISD::VVP_SELECT:
+    return 2;
+  }
+
+  return None;
+}
+
 bool isLegalAVL(SDValue AVL) { return AVL->getOpcode() == VEISD::LEGALAVL; }
 
 SDValue getNodeAVL(SDValue Op) {
@@ -132,6 +182,11 @@
   return PosOpt ? Op->getOperand(*PosOpt) : SDValue();
 }
 
+SDValue getNodeMask(SDValue Op) {
+  auto PosOpt = getMaskPos(Op->getOpcode());
+  return PosOpt ? Op->getOperand(*PosOpt) : SDValue();
+}
+
 std::pair<SDValue, bool> getAnnotatedNodeAVL(SDValue Op) {
   SDValue AVL = getNodeAVL(Op);
   if (!AVL)
@@ -218,7 +273,7 @@
 }
 
 SDValue VECustomDAG::getUnpack(EVT DestVT, SDValue Vec, PackElem Part,
-                               SDValue AVL) {
+                               SDValue AVL) const {
   // TODO: Peek through VEC_PACK and VEC_BROADCAST(REPL_ ..) operands.
   unsigned OC =
       (Part == PackElem::Lo) ? VEISD::VEC_UNPACK_LO : VEISD::VEC_UNPACK_HI;
@@ -226,9 +281,32 @@
 }
 
 SDValue VECustomDAG::getPack(EVT DestVT, SDValue LoVec, SDValue HiVec,
-                             SDValue AVL) {
+                             SDValue AVL) const {
   // TODO: Peek through VEC_UNPACK_LO|HI operands.
   return DAG.getNode(VEISD::VEC_PACK, DL, DestVT, LoVec, HiVec, AVL);
 }
 
+VETargetMasks VECustomDAG::getTargetSplitMask(SDValue RawMask, SDValue RawAVL,
+                                              PackElem Part) const {
+  // Adjust the AVL for this part.
+  SDValue NewAVL;
+  SDValue OneV = getConstant(1, MVT::i32);
+  if (Part == PackElem::Hi)
+    NewAVL = getNode(ISD::ADD, MVT::i32, {RawAVL, OneV});
+  else
+    NewAVL = RawAVL;
+  NewAVL = getNode(ISD::SRL, MVT::i32, {NewAVL, OneV});
+
+  NewAVL = annotateLegalAVL(NewAVL);
+
+  // Legalize the mask (unpack it, or use an all-true mask).
+  SDValue NewMask;
+  if (!RawMask)
+    NewMask = getConstantMask(Packing::Normal, true);
+  else
+    NewMask = getUnpack(MVT::v256i1, RawMask, Part, NewAVL);
+
+  return VETargetMasks(NewMask, NewAVL);
+}
+
 } // namespace llvm
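Note on the AVL arithmetic in getTargetSplitMask above: a packed AVL counts 32-bit elements, while each split half runs on 64-bit lanes, so the Hi part takes the rounded-up half and the Lo part the rounded-down half. A minimal standalone sketch of that arithmetic (illustrative only; splitAVL is a made-up name, not part of this patch):

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the ADD/SRL sequence in getTargetSplitMask: the Hi part gets
// ceil(AVL / 2) elements, the Lo part gets floor(AVL / 2).
uint32_t splitAVL(uint32_t PackedAVL, bool IsHiPart) {
  return (IsHiPart ? PackedAVL + 1 : PackedAVL) >> 1;
}

int main() {
  assert(splitAVL(5, /*IsHiPart=*/true) == 3);  // ceil(5 / 2)
  assert(splitAVL(5, /*IsHiPart=*/false) == 2); // floor(5 / 2)
  // Together the two parts cover all packed elements.
  assert(splitAVL(5, true) + splitAVL(5, false) == 5);
  return 0;
}
```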
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -187,6 +187,7 @@
   /// VVP Lowering {
   SDValue lowerToVVP(SDValue Op, SelectionDAG &DAG) const;
   SDValue legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue splitVectorOp(SDValue Op, VECustomDAG &CDAG) const;
   SDValue legalizePackedAVL(SDValue Op, VECustomDAG &CDAG) const;
   SDValue splitMaskArithmetic(SDValue Op, SelectionDAG &DAG) const;
   /// } VVPLowering
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -1681,6 +1681,9 @@
 
 TargetLowering::LegalizeAction
 VETargetLowering::getCustomOperationAction(SDNode &Op) const {
+  if (isPackingSupportOpcode(Op.getOpcode()))
+    return Legal;
+
   // Custom lower to legalize AVL for packed mode.
   if (isVVPOrVEC(Op.getOpcode()))
     return Custom;
diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td
--- a/llvm/lib/Target/VE/VEInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td
@@ -125,3 +125,26 @@
               (v512i1 (IMPLICIT_DEF)), $vlo, sub_vm_odd),
           $vhi, sub_vm_even)>;
+
+// v256.32 <> v512.32
+multiclass Packing<ValueType PackVT> {
+  // no-op unpacks
+  def : Pat<(v256i32 (vec_unpack_lo PackVT:$vp, (i32 srcvalue))),
+            (COPY_TO_REGCLASS $vp, V64)>;
+  def : Pat<(v256f32 (vec_unpack_hi PackVT:$vp, (i32 srcvalue))),
+            (COPY_TO_REGCLASS $vp, V64)>;
+
+  // shuffle unpacks
+  def : Pat<(v256f32 (vec_unpack_lo PackVT:$vp, i32:$avl)),
+            (VSHFvvil $vp, $vp, 4, $avl)>; // always pick lo
+  def : Pat<(v256i32 (vec_unpack_hi PackVT:$vp, i32:$avl)),
+            (VSHFvvil $vp, $vp, 0, $avl)>; // always pick hi
+}
+
+defm : Packing<v512i32>;
+defm : Packing<v512f32>;
+
+def : Pat<(v512i32 (vec_pack v256i32:$vlo, v256i32:$vhi, i32:$avl)),
+          (VSHFvvil $vlo, $vhi, 13, $avl)>;
+def : Pat<(v512f32 (vec_pack v256f32:$vlo, v256f32:$vhi, i32:$avl)),
+          (VSHFvvil $vlo, $vhi, 8, $avl)>;
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -21,10 +21,68 @@
 SDValue VETargetLowering::legalizeInternalVectorOp(SDValue Op,
                                                    SelectionDAG &DAG) const {
   VECustomDAG CDAG(DAG, Op);
+
+  EVT IdiomVT = Op.getValueType();
+  if (isPackedVectorType(IdiomVT) &&
+      !supportsPackedMode(Op.getOpcode(), IdiomVT))
+    return splitVectorOp(Op, CDAG);
+
   // TODO: Implement odd/even splitting.
   return legalizePackedAVL(Op, CDAG);
 }
 
+SDValue VETargetLowering::splitVectorOp(SDValue Op, VECustomDAG &CDAG) const {
+  MVT ResVT = splitVectorType(Op.getValue(0).getSimpleValueType());
+
+  auto AVLPos = getAVLPos(Op->getOpcode());
+  auto MaskPos = getMaskPos(Op->getOpcode());
+
+  SDValue PackedMask = getNodeMask(Op);
+  auto AVLPair = getAnnotatedNodeAVL(Op);
+  SDValue PackedAVL = AVLPair.first;
+  assert(!AVLPair.second && "Expecting a non-pack-legalized operation");
+
+  // Request the parts.
+  SDValue PartOps[2];
+
+  SDValue UpperPartAVL; // Used for packing the parts back together.
+  for (PackElem Part : {PackElem::Hi, PackElem::Lo}) {
+    // VP ops already have an explicit mask and AVL. When expanding from a
+    // non-VP op, attach those additional inputs here.
+    auto SplitTM = CDAG.getTargetSplitMask(PackedMask, PackedAVL, Part);
+
+    if (Part == PackElem::Hi)
+      UpperPartAVL = SplitTM.AVL;
+
+    // Attach the non-predicating value operands.
+    SmallVector<SDValue, 4> OpVec;
+    for (unsigned i = 0; i < Op.getNumOperands(); ++i) {
+      if (AVLPos && ((int)i) == *AVLPos)
+        continue;
+      if (MaskPos && ((int)i) == *MaskPos)
+        continue;
+
+      // Value operand.
+      auto PackedOperand = Op.getOperand(i);
+      auto UnpackedOpVT = splitVectorType(PackedOperand.getSimpleValueType());
+      SDValue PartV =
+          CDAG.getUnpack(UnpackedOpVT, PackedOperand, Part, SplitTM.AVL);
+      OpVec.push_back(PartV);
+    }
+
+    // Add the predicating args and generate the part node.
+    OpVec.push_back(SplitTM.Mask);
+    OpVec.push_back(SplitTM.AVL);
+    // Emit legal VVP nodes.
+    PartOps[(int)Part] =
+        CDAG.getNode(Op.getOpcode(), ResVT, OpVec, Op->getFlags());
+  }
+
+  // Re-package the vector parts.
+  return CDAG.getPack(Op.getValueType(), PartOps[(int)PackElem::Lo],
+                      PartOps[(int)PackElem::Hi], UpperPartAVL);
+}
+
 SDValue VETargetLowering::legalizePackedAVL(SDValue Op,
                                             VECustomDAG &CDAG) const {
   LLVM_DEBUG(dbgs() << "::legalizePackedAVL\n";);
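splitVectorOp above unpacks each packed operand into two v256 halves, emits one legal VVP node per half, and repacks the two results using the Hi part's AVL. A scalar model of the per-lane layout this relies on, under the assumption suggested by the PackElem comments (Lo is bits (63,32], Hi is bits (32,0] of a 64-bit lane); the authoritative semantics are the VSHF patterns in VEInstrPatternsVec.td:

```cpp
#include <cassert>
#include <cstdint>

// One packed 64-bit lane holds two 32-bit elements (assumed layout).
uint32_t unpackLo(uint64_t Lane) { return uint32_t(Lane >> 32); } // (63,32]
uint32_t unpackHi(uint64_t Lane) { return uint32_t(Lane); }       // (32,0]

uint64_t packLane(uint32_t LoElem, uint32_t HiElem) {
  return (uint64_t(LoElem) << 32) | HiElem;
}

int main() {
  uint64_t Lane = packLane(0xAAAAAAAAu, 0xBBBBBBBBu);
  assert(unpackLo(Lane) == 0xAAAAAAAAu);
  assert(unpackHi(Lane) == 0xBBBBBBBBu);
  return 0;
}
```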
diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -38,31 +38,37 @@
     ADD_BINARY_VVP_OP(VVP_##NAME,VP_##NAME,NAME)
 #endif
 
+/// REGISTER_PACKED(OPC)
+/// \p OPC The VVP opcode of the operation.
+#ifndef REGISTER_PACKED
+#define REGISTER_PACKED(OPC)
+#endif
+
 // Integer arithmetic.
-ADD_BINARY_VVP_OP_COMPACT(ADD)
-ADD_BINARY_VVP_OP_COMPACT(SUB)
+ADD_BINARY_VVP_OP_COMPACT(ADD) REGISTER_PACKED(VVP_ADD)
+ADD_BINARY_VVP_OP_COMPACT(SUB) REGISTER_PACKED(VVP_SUB)
 ADD_BINARY_VVP_OP_COMPACT(MUL)
 ADD_BINARY_VVP_OP_COMPACT(UDIV)
 ADD_BINARY_VVP_OP_COMPACT(SDIV)
-ADD_BINARY_VVP_OP(VVP_SRA,VP_ASHR,SRA)
-ADD_BINARY_VVP_OP(VVP_SRL,VP_LSHR,SRL)
-ADD_BINARY_VVP_OP_COMPACT(SHL)
+ADD_BINARY_VVP_OP(VVP_SRA,VP_ASHR,SRA) REGISTER_PACKED(VVP_SRA)
+ADD_BINARY_VVP_OP(VVP_SRL,VP_LSHR,SRL) REGISTER_PACKED(VVP_SRL)
+ADD_BINARY_VVP_OP_COMPACT(SHL) REGISTER_PACKED(VVP_SHL)
 
-ADD_BINARY_VVP_OP_COMPACT(AND)
-ADD_BINARY_VVP_OP_COMPACT(OR)
-ADD_BINARY_VVP_OP_COMPACT(XOR)
+ADD_BINARY_VVP_OP_COMPACT(AND) REGISTER_PACKED(VVP_AND)
+ADD_BINARY_VVP_OP_COMPACT(OR) REGISTER_PACKED(VVP_OR)
+ADD_BINARY_VVP_OP_COMPACT(XOR) REGISTER_PACKED(VVP_XOR)
 
 // FP arithmetic.
-ADD_BINARY_VVP_OP_COMPACT(FADD)
-ADD_BINARY_VVP_OP_COMPACT(FSUB)
-ADD_BINARY_VVP_OP_COMPACT(FMUL)
+ADD_BINARY_VVP_OP_COMPACT(FADD) REGISTER_PACKED(VVP_FADD)
+ADD_BINARY_VVP_OP_COMPACT(FSUB) REGISTER_PACKED(VVP_FSUB)
+ADD_BINARY_VVP_OP_COMPACT(FMUL) REGISTER_PACKED(VVP_FMUL)
 ADD_BINARY_VVP_OP_COMPACT(FDIV)
 
 ADD_VVP_OP(VVP_SETCC, SETCC)
 
 // Shuffles.
-ADD_VVP_OP(VVP_SELECT,VSELECT)
+ADD_VVP_OP(VVP_SELECT,VSELECT) REGISTER_PACKED(VVP_SELECT)
 HANDLE_VP_TO_VVP(VP_SELECT, VVP_SELECT)
 HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT)
 
@@ -70,3 +76,4 @@
 #undef ADD_BINARY_VVP_OP_COMPACT
 #undef ADD_VVP_OP
 #undef HANDLE_VP_TO_VVP
+#undef REGISTER_PACKED
diff --git a/llvm/test/CodeGen/VE/Packed/vp_fdiv.ll b/llvm/test/CodeGen/VE/Packed/vp_fdiv.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vp_fdiv.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <512 x float> @llvm.vp.fdiv.v512f32(<512 x float>, <512 x float>, <512 x i1>, i32)
+
+define fastcc <512 x float> @test_vp_fdiv_v512f32_vv(<512 x float> %i0, <512 x float> %i1, <512 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v512f32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s0, (32)0
+; CHECK-NEXT:    srl %s1, %s1, 1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vshf %v2, %v1, %v1, 4
+; CHECK-NEXT:    vshf %v3, %v0, %v0, 4
+; CHECK-NEXT:    vfdiv.s %v2, %v3, %v2, %vm3
+; CHECK-NEXT:    adds.w.sx %s0, 1, %s0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    srl %s0, %s0, 1
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfdiv.s %v0, %v0, %v1, %vm2
+; CHECK-NEXT:    vshf %v0, %v2, %v0, 8
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <512 x float> @llvm.vp.fdiv.v512f32(<512 x float> %i0, <512 x float> %i1, <512 x i1> %m, i32 %n)
+  ret <512 x float> %r0
+}
+
+define fastcc <512 x float> @test_vp_fdiv_v512f32_rv(float %s0, <512 x float> %i1, <512 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v512f32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s0, (32)1
+; CHECK-NEXT:    srl %s0, %s0, 32
+; CHECK-NEXT:    or %s0, %s0, %s2
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    adds.w.sx %s0, 1, %s1
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    srl %s0, %s0, 1
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfdiv.s %v2, %v1, %v0, %vm2
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    srl %s1, %s1, 1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vshf %v1, %v1, %v1, 4
+; CHECK-NEXT:    vshf %v0, %v0, %v0, 4
+; CHECK-NEXT:    vfdiv.s %v0, %v1, %v0, %vm3
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vshf %v0, %v0, %v2, 8
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <512 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <512 x float> %xins, <512 x float> undef, <512 x i32> zeroinitializer
+  %r0 = call <512 x float> @llvm.vp.fdiv.v512f32(<512 x float> %i0, <512 x float> %i1, <512 x i1> %m, i32 %n)
+  ret <512 x float> %r0
+}
+
+define fastcc <512 x float> @test_vp_fdiv_v512f32_vr(<512 x float> %i0, float %s1, <512 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v512f32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s0, (32)1
+; CHECK-NEXT:    srl %s0, %s0, 32
+; CHECK-NEXT:    or %s0, %s0, %s2
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    adds.w.sx %s0, 1, %s1
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    srl %s0, %s0, 1
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfdiv.s %v2, %v0, %v1, %vm2
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    srl %s1, %s1, 1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vshf %v1, %v1, %v1, 4
+; CHECK-NEXT:    vshf %v0, %v0, %v0, 4
+; CHECK-NEXT:    vfdiv.s %v0, %v0, %v1, %vm3
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vshf %v0, %v0, %v2, 8
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <512 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <512 x float> %yins, <512 x float> undef, <512 x i32> zeroinitializer
+  %r0 = call <512 x float> @llvm.vp.fdiv.v512f32(<512 x float> %i0, <512 x float> %i1, <512 x i1> %m, i32 %n)
+  ret <512 x float> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Packed/vp_mul.ll b/llvm/test/CodeGen/VE/Packed/vp_mul.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vp_mul.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <512 x i32> @llvm.vp.mul.v512i32(<512 x i32>, <512 x i32>, <512 x i1>, i32)
+
+define fastcc <512 x i32> @test_vp_v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_v512i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    adds.w.sx %s1, 1, %s0
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    srl %s1, %s1, 1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vshf %v2, %v1, %v1, 0
+; CHECK-NEXT:    vshf %v3, %v0, %v0, 0
+; CHECK-NEXT:    vmuls.w.sx %v2, %v3, %v2, %vm2
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    srl %s0, %s0, 1
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmuls.w.sx %v0, %v0, %v1, %vm3
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vshf %v0, %v0, %v2, 13
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <512 x i32> @llvm.vp.mul.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n)
+  ret <512 x i32> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Packed/vp_sdiv.ll b/llvm/test/CodeGen/VE/Packed/vp_sdiv.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vp_sdiv.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <512 x i32> @llvm.vp.sdiv.v512i32(<512 x i32>, <512 x i32>, <512 x i1>, i32)
+
+define fastcc <512 x i32> @test_vp_sdiv_v512i32_vv(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_sdiv_v512i32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    adds.w.sx %s1, 1, %s0
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    srl %s1, %s1, 1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vshf %v2, %v1, %v1, 0
+; CHECK-NEXT:    vshf %v3, %v0, %v0, 0
+; CHECK-NEXT:    vdivs.w.sx %v2, %v3, %v2, %vm2
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    srl %s0, %s0, 1
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vdivs.w.sx %v0, %v0, %v1, %vm3
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vshf %v0, %v0, %v2, 13
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <512 x i32> @llvm.vp.sdiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n)
+  ret <512 x i32> %r0
+}
+
+define fastcc <512 x i32> @test_vp_sdiv_v512i32_rv(i32 %s0, <512 x i32> %i1, <512 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_sdiv_v512i32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    sll %s2, %s0, 32
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    or %s0, %s0, %s2
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    adds.w.sx %s0, 1, %s1
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    srl %s0, %s0, 1
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vshf %v2, %v1, %v1, 0
+; CHECK-NEXT:    vshf %v3, %v0, %v0, 0
+; CHECK-NEXT:    vdivs.w.sx %v2, %v2, %v3, %vm2
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    srl %s1, %s1, 1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivs.w.sx %v0, %v1, %v0, %vm3
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vshf %v0, %v0, %v2, 13
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <512 x i32> undef, i32 %s0, i32 0
+  %i0 = shufflevector <512 x i32> %xins, <512 x i32> undef, <512 x i32> zeroinitializer
+  %r0 = call <512 x i32> @llvm.vp.sdiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n)
+  ret <512 x i32> %r0
+}
+
+define fastcc <512 x i32> @test_vp_sdiv_v512i32_vr(<512 x i32> %i0, i32 %s1, <512 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_sdiv_v512i32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    sll %s2, %s0, 32
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    or %s0, %s0, %s2
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    adds.w.sx %s0, 1, %s1
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    srl %s0, %s0, 1
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vshf %v2, %v1, %v1, 0
+; CHECK-NEXT:    vshf %v3, %v0, %v0, 0
+; CHECK-NEXT:    vdivs.w.sx %v2, %v3, %v2, %vm2
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    srl %s1, %s1, 1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivs.w.sx %v0, %v0, %v1, %vm3
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vshf %v0, %v0, %v2, 13
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <512 x i32> undef, i32 %s1, i32 0
+  %i1 = shufflevector <512 x i32> %yins, <512 x i32> undef, <512 x i32> zeroinitializer
+  %r0 = call <512 x i32> @llvm.vp.sdiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n)
+  ret <512 x i32> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Packed/vp_udiv.ll b/llvm/test/CodeGen/VE/Packed/vp_udiv.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vp_udiv.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <512 x i32> @llvm.vp.udiv.v512i32(<512 x i32>, <512 x i32>, <512 x i1>, i32)
+
+define fastcc <512 x i32> @test_vp_udiv_v512i32_vv(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_udiv_v512i32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    adds.w.sx %s1, 1, %s0
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    srl %s1, %s1, 1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vshf %v2, %v1, %v1, 0
+; CHECK-NEXT:    vshf %v3, %v0, %v0, 0
+; CHECK-NEXT:    vdivu.w %v2, %v3, %v2, %vm2
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    srl %s0, %s0, 1
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vdivu.w %v0, %v0, %v1, %vm3
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vshf %v0, %v0, %v2, 13
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <512 x i32> @llvm.vp.udiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n)
+  ret <512 x i32> %r0
+}
+
+define fastcc <512 x i32> @test_vp_udiv_v512i32_rv(i32 %s0, <512 x i32> %i1, <512 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_udiv_v512i32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    sll %s2, %s0, 32
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    or %s0, %s0, %s2
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    adds.w.sx %s0, 1, %s1
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    srl %s0, %s0, 1
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vshf %v2, %v1, %v1, 0
+; CHECK-NEXT:    vshf %v3, %v0, %v0, 0
+; CHECK-NEXT:    vdivu.w %v2, %v2, %v3, %vm2
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    srl %s1, %s1, 1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivu.w %v0, %v1, %v0, %vm3
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vshf %v0, %v0, %v2, 13
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <512 x i32> undef, i32 %s0, i32 0
+  %i0 = shufflevector <512 x i32> %xins, <512 x i32> undef, <512 x i32> zeroinitializer
+  %r0 = call <512 x i32> @llvm.vp.udiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n)
+  ret <512 x i32> %r0
+}
+
+define fastcc <512 x i32> @test_vp_udiv_v512i32_vr(<512 x i32> %i0, i32 %s1, <512 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_udiv_v512i32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    sll %s2, %s0, 32
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    or %s0, %s0, %s2
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    adds.w.sx %s0, 1, %s1
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    srl %s0, %s0, 1
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vshf %v2, %v1, %v1, 0
+; CHECK-NEXT:    vshf %v3, %v0, %v0, 0
+; CHECK-NEXT:    vdivu.w %v2, %v3, %v2, %vm2
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    srl %s1, %s1, 1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivu.w %v0, %v0, %v1, %vm3
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vshf %v0, %v0, %v2, 13
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <512 x i32> undef, i32 %s1, i32 0
+  %i1 = shufflevector <512 x i32> %yins, <512 x i32> undef, <512 x i32> zeroinitializer
+  %r0 = call <512 x i32> @llvm.vp.udiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n)
+  ret <512 x i32> %r0
+}
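For reference, REGISTER_PACKED follows the usual .def-file macro pattern: supportsPackedMode() defines it as a case label and re-includes VVPNodes.def. A sketch of the approximate post-preprocessing shape (the enum stub is a stand-in for the real VEISD enumeration; only two entries shown):

```cpp
// Stand-in for the real VEISD enumeration; for illustration only.
namespace VEISD {
enum NodeType : unsigned { VEC_BROADCAST, VVP_ADD, VVP_SUB };
} // namespace VEISD

// Approximate expansion of the switch in supportsPackedMode().
bool supportsPackedModeExpanded(unsigned Opcode, bool IsPackedOp,
                                bool IsMaskOp) {
  switch (Opcode) {
  default:
    return false;
  case VEISD::VEC_BROADCAST:
    return true;
  case VEISD::VVP_ADD:
  case VEISD::VVP_SUB:
    // ... one case label per REGISTER_PACKED entry in VVPNodes.def ...
    return IsPackedOp && !IsMaskOp;
  }
}
```

An op that is currently split, such as VVP_MUL (exercised by vp_mul.ll above), would opt into native packed isel by appending REGISTER_PACKED(VVP_MUL) to its line in VVPNodes.def.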