diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -25,6 +25,9 @@

 bool isVVPBinaryOp(unsigned Opcode);

+/// \returns true if \p SomeVT is a vector type with more than 256 elements.
+bool isPackedVectorType(EVT SomeVT);
+
 class VECustomDAG {
   SelectionDAG &DAG;
   SDLoc DL;
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -19,6 +19,17 @@

 namespace llvm {

+/// Vector types with more elements than this are considered packed.
+static const unsigned StandardVectorWidth = 256;
+
+/// \returns true if \p SomeVT is a vector type with more than
+/// StandardVectorWidth elements; false for non-vector types.
+bool isPackedVectorType(EVT SomeVT) {
+  if (!SomeVT.isVector())
+    return false;
+  return SomeVT.getVectorNumElements() > StandardVectorWidth;
+}
+
 /// \returns the VVP_* SDNode opcode corresponsing to \p OC.
 Optional getVVPOpcode(unsigned Opcode) {
   switch (Opcode) {
@@ -51,6 +62,22 @@

 SDValue VECustomDAG::getBroadcast(EVT ResultVT, SDValue Scalar,
                                   SDValue AVL) const {
+  assert(ResultVT.isVector() && "Broadcast result must be a vector type");
+  auto ScaVT = Scalar.getValueType();
+  assert(ScaVT != MVT::i1 && "TODO: Mask broadcasts");
+
+  if (isPackedVectorType(ResultVT)) {
+    // v512x packed mode broadcast
+    // Replicate the scalar reg (f32 or i32) onto the opposing half of the full
+    // scalar register. If it's an I64 type, assume that this has already
+    // happened.
+    if (ScaVT == MVT::f32) {
+      Scalar = getNode(VEISD::REPL_F32, MVT::i64, Scalar);
+    } else if (ScaVT == MVT::i32) {
+      Scalar = getNode(VEISD::REPL_I32, MVT::i64, Scalar);
+    }
+  }
+
   return getNode(VEISD::VEC_BROADCAST, ResultVT, {Scalar, AVL});
 }

diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -40,6 +40,8 @@
   TS1AM, // A TS1AM instruction used for 1/2 bytes swap.
   VEC_BROADCAST, // A vector broadcast instruction.
                  // 0: scalar value, 1: VL
+  REPL_I32,      // Replicate the i32 subregister to the other half.
+  REPL_F32,      // Replicate the f32 subregister to the other half.

   // VVP_* nodes.
 #define ADD_VVP_OP(VVP_NAME, ...)
VVP_NAME,
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -11,9 +11,9 @@
 //
 //===----------------------------------------------------------------------===//

-#include "VECustomDAG.h"
 #include "VEISelLowering.h"
 #include "MCTargetDesc/VEMCExpr.h"
+#include "VECustomDAG.h"
 #include "VEInstrBuilder.h"
 #include "VEMachineFunctionInfo.h"
 #include "VERegisterInfo.h"
@@ -899,6 +899,8 @@
     TARGET_NODE_CASE(RET_FLAG)
     TARGET_NODE_CASE(TS1AM)
     TARGET_NODE_CASE(VEC_BROADCAST)
+    TARGET_NODE_CASE(REPL_I32)
+    TARGET_NODE_CASE(REPL_F32)

     // Register the VVP_* SDNodes.
 #define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
@@ -1642,8 +1644,7 @@
 SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                             SelectionDAG &DAG) const {
   VECustomDAG CDAG(DAG, Op);
-  unsigned NumEls = Op.getValueType().getVectorNumElements();
-  MVT ElemVT = Op.getSimpleValueType().getVectorElementType();
+  MVT ResultVT = Op.getSimpleValueType();

   // If there is just one element, expand to INSERT_VECTOR_ELT.
   unsigned UniqueIdx;
@@ -1651,17 +1652,17 @@
     SDValue AccuV = CDAG.getUNDEF(Op.getValueType());
     auto ElemV = Op->getOperand(UniqueIdx);
     SDValue IdxV = CDAG.getConstant(UniqueIdx, MVT::i64);
-    return CDAG.getNode(ISD::INSERT_VECTOR_ELT, Op.getValueType(),
-                        {AccuV, ElemV, IdxV});
+    return CDAG.getNode(ISD::INSERT_VECTOR_ELT, ResultVT, {AccuV, ElemV, IdxV});
   }

   // Else emit a broadcast.
   if (SDValue ScalarV = getSplatValue(Op.getNode())) {
-    // lower to VEC_BROADCAST
-    MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);
-
-    auto AVL = CDAG.getConstant(NumEls, MVT::i32);
-    return CDAG.getBroadcast(LegalResVT, Op.getOperand(0), AVL);
+    unsigned NumEls = ResultVT.getVectorNumElements();
+    // TODO: Legalize packed-mode AVL.
+    // For now, cap the AVL at 256.
+    auto CappedLength = std::min<unsigned>(256, NumEls);
+    auto AVL = CDAG.getConstant(CappedLength, MVT::i32);
+    return CDAG.getBroadcast(ResultVT, ScalarV, AVL);
   }

   // Expand
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -1576,6 +1576,12 @@
 def l2f : OutPatFrag<(ops node:$exp),
                      (EXTRACT_SUBREG $exp, sub_f32)>;

+// Zero out one 32-bit subregister, keeping the other half of the register.
+def zero_i32 : OutPatFrag<(ops node:$expr),
+                          (ANDrm $expr, 32)>;
+def zero_f32 : OutPatFrag<(ops node:$expr),
+                          (ANDrm $expr, !add(32, 64))>;
+
 // Small immediates.
 def : Pat<(i32 simm7:$val), (EXTRACT_SUBREG (ORim (LO7 $val), 0), sub_i32)>;
 def : Pat<(i64 simm7:$val), (ORim (LO7 $val), 0)>;
@@ -2287,6 +2293,16 @@
 def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST",
                            SDTypeProfile<1, 2, [SDTCisVec<0>, IsVLVT<2>]>>;

+// Replicate upper 32 bits to lower 32 bits (f32 scalar replication).
+def repl_f32 : SDNode<"VEISD::REPL_F32",
+                      SDTypeProfile<1, 1,
+                                    [SDTCisInt<0>, SDTCisFP<1>]>>;
+// Replicate lower 32 bits to upper 32 bits (i32 scalar replication).
+def repl_i32 : SDNode<"VEISD::REPL_I32",
+                      SDTypeProfile<1, 1,
+                                    [SDTCisInt<0>, SDTCisInt<1>]>>;
+
+
 // Whether this is an all-true mask (assuming undef-bits above VL are all-true).
 def true_mask : PatLeaf<
   (vec_broadcast (i32 nonzero), (i32 srcvalue))>;
diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td
--- a/llvm/lib/Target/VE/VEInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td
@@ -15,6 +15,17 @@
 // Instruction format superclass
 //===----------------------------------------------------------------------===//

+// Sub-register replication for packed broadcast.
+def: Pat<(i64 (repl_f32 f32:$val)),
+         (ORrr
+           (SRLri (f2l $val), 32),
+           (zero_i32 (f2l $val)))>;
+def: Pat<(i64 (repl_i32 i32:$val)),
+         (ORrr
+           (zero_f32 (i2l $val)),
+           (SLLri (i2l $val), 32))>;
+
+
 multiclass vbrd_elem32 {
   // VBRDil
@@ -89,3 +100,8 @@

 defm : patterns_elem64;
 defm : patterns_elem64;
+
+defm : vbrd_elem64;
+defm : vbrd_elem64;
+defm : vbrd_elem64;
+defm : vbrd_elem64;
diff --git a/llvm/test/CodeGen/VE/Packed/vec_broadcast.ll b/llvm/test/CodeGen/VE/Packed/vec_broadcast.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vec_broadcast.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+define fastcc <512 x i32> @brd_v512i32(i32 %s) {
+; CHECK-LABEL: brd_v512i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    sll %s1, %s0, 32
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    or %s0, %s0, %s1
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <512 x i32> undef, i32 %s, i32 0
+  %ret = shufflevector <512 x i32> %val, <512 x i32> undef, <512 x i32> zeroinitializer
+  ret <512 x i32> %ret
+}
+
+define fastcc <512 x i32> @brdi_v512i32() {
+; CHECK-LABEL: brdi_v512i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    or %s0, 17, (0)1
+; CHECK-NEXT:    sll %s1, %s0, 32
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    or %s0, %s0, %s1
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <512 x i32> undef, i32 17, i32 0
+  %ret = shufflevector <512 x i32> %val, <512 x i32> undef, <512 x i32> zeroinitializer
+  ret <512 x i32> %ret
+}
+
+define fastcc <512 x float> @brd_v512f32(float %s) {
+; CHECK-LABEL: brd_v512f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s0, (32)1
+; CHECK-NEXT:    srl %s0, %s0, 32
+; CHECK-NEXT:    or %s0, %s0, %s1
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <512 x float> undef, float %s, i32 0
+  %ret = shufflevector <512 x float> %val, <512 x float> undef, <512 x i32> zeroinitializer
+  ret <512 x float> %ret
+}
+
+define fastcc <512 x float> @brdi_v512f32() {
+; CHECK-LABEL: brdi_v512f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea.sl %s0, 0
+; CHECK-NEXT:    and %s1, %s0, (32)1
+; CHECK-NEXT:    srl %s0, %s0, 32
+; CHECK-NEXT:    or %s0, %s0, %s1
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <512 x float> undef, float 0.e+00, i32 0
+  %ret = shufflevector <512 x float> %val, <512 x float> undef, <512 x i32> zeroinitializer
+  ret <512 x float> %ret
+}