diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -188,6 +188,11 @@
   SDValue annotateLegalAVL(SDValue AVL) const;
   VETargetMasks getTargetSplitMask(SDValue RawMask, SDValue RawAVL,
                                    PackElem Part) const;
+
+  // Splitting support
+  SDValue getSplitPtrOffset(SDValue Ptr, SDValue ByteStride,
+                            PackElem Part) const;
+  SDValue getSplitPtrStride(SDValue PackStride) const;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -155,6 +155,10 @@
     return 1;
   case VEISD::VVP_SELECT:
     return 3;
+  case VEISD::VVP_LOAD:
+    return 4;
+  case VEISD::VVP_STORE:
+    return 5;
   }
 
   return None;
@@ -431,4 +435,19 @@
   return VETargetMasks(NewMask, NewAVL);
 }
 
+SDValue VECustomDAG::getSplitPtrOffset(SDValue Ptr, SDValue ByteStride,
+                                       PackElem Part) const {
+  // High starts at base ptr but has more significant bits in the 64bit vector
+  // element.
+  if (Part == PackElem::Hi)
+    return Ptr;
+  return getNode(ISD::ADD, MVT::i64, {Ptr, ByteStride});
+}
+
+SDValue VECustomDAG::getSplitPtrStride(SDValue PackStride) const {
+  if (auto ConstBytes = dyn_cast<ConstantSDNode>(PackStride))
+    return getConstant(2 * ConstBytes->getSExtValue(), MVT::i64);
+  return getNode(ISD::SHL, MVT::i64, {PackStride, getConstant(1, MVT::i32)});
+}
+
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -189,7 +189,9 @@
   SDValue lowerVVP_LOAD_STORE(SDValue Op, VECustomDAG&) const;
 
   SDValue legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue legalizeInternalLoadStoreOp(SDValue Op, VECustomDAG &CDAG) const;
   SDValue splitVectorOp(SDValue Op, VECustomDAG &CDAG) const;
+  SDValue splitPackedLoadStore(SDValue Op, VECustomDAG &CDAG) const;
   SDValue legalizePackedAVL(SDValue Op, VECustomDAG &CDAG) const;
   SDValue splitMaskArithmetic(SDValue Op, SelectionDAG &DAG) const;
   /// } VVPLowering
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -114,8 +114,6 @@
   auto DataVT = *getIdiomaticVectorType(Op.getNode());
   auto Packing = getTypePacking(DataVT);
 
-  assert(Packing == Packing::Normal && "TODO Packed load store isel");
-
   // TODO: Infer lower AVL from mask.
   if (!AVL)
     AVL = CDAG.getConstant(DataVT.getVectorNumElements(), MVT::i32);
@@ -150,10 +148,117 @@
                       {Chain, Data, BasePtr, StrideV, Mask, AVL});
 }
 
+SDValue VETargetLowering::splitPackedLoadStore(SDValue Op,
+                                               VECustomDAG &CDAG) const {
+  auto VVPOC = *getVVPOpcode(Op.getOpcode());
+  assert((VVPOC == VEISD::VVP_LOAD) || (VVPOC == VEISD::VVP_STORE));
+
+  MVT DataVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+  assert(getTypePacking(DataVT) == Packing::Dense &&
+         "Can only split packed load/store");
+  MVT SplitDataVT = splitVectorType(DataVT);
+
+  SDValue PassThru = getNodePassthru(Op);
+  assert(!PassThru && "Should have been folded in lowering to VVP layer");
+
+  // Analyze the operation
+  SDValue PackedMask = getNodeMask(Op);
+  SDValue PackedAVL = getAnnotatedNodeAVL(Op).first;
+  SDValue PackPtr = getMemoryPtr(Op);
+  SDValue PackData = getStoredValue(Op);
+  SDValue PackStride = getLoadStoreStride(Op, CDAG);
+
+  unsigned ChainResIdx = PackData ? 0 : 1;
+
+  SDValue PartOps[2];
+
+  SDValue UpperPartAVL; // we will use this for packing things back together
+  for (PackElem Part : {PackElem::Hi, PackElem::Lo}) {
+    // VP ops already have an explicit mask and AVL. When expanding from non-VP
+    // attach those additional inputs here.
+    auto SplitTM = CDAG.getTargetSplitMask(PackedMask, PackedAVL, Part);
+
+    // Keep track of the (higher) lvl.
+    if (Part == PackElem::Hi)
+      UpperPartAVL = SplitTM.AVL;
+
+    // Attach non-predicating value operands
+    SmallVector<SDValue, 16> OpVec;
+
+    // Chain
+    OpVec.push_back(getNodeChain(Op));
+
+    // Data
+    if (PackData) {
+      SDValue PartData =
+          CDAG.getUnpack(SplitDataVT, PackData, Part, SplitTM.AVL);
+      OpVec.push_back(PartData);
+    }
+
+    // Ptr & Stride
+    // Push (ptr + ElemBytes * <Part>, 2 * ElemBytes)
+    // Stride info
+    // EVT DataVT = LegalizeVectorType(getMemoryDataVT(Op), Op, DAG, Mode);
+    OpVec.push_back(CDAG.getSplitPtrOffset(PackPtr, PackStride, Part));
+    OpVec.push_back(CDAG.getSplitPtrStride(PackStride));
+
+    // Add predicating args and generate part node
+    OpVec.push_back(SplitTM.Mask);
+    OpVec.push_back(SplitTM.AVL);
+
+    if (PackData) {
+      // Store
+      PartOps[(int)Part] = CDAG.getNode(VVPOC, MVT::Other, OpVec);
+    } else {
+      // Load
+      PartOps[(int)Part] =
+          CDAG.getNode(VVPOC, {SplitDataVT, MVT::Other}, OpVec);
+    }
+  }
+
+  // Merge the chains
+  SDValue LowChain = SDValue(PartOps[(int)PackElem::Lo].getNode(), ChainResIdx);
+  SDValue HiChain = SDValue(PartOps[(int)PackElem::Hi].getNode(), ChainResIdx);
+  SDValue FusedChains =
+      CDAG.getNode(ISD::TokenFactor, MVT::Other, {LowChain, HiChain});
+
+  // Chain only [store]
+  if (PackData)
+    return FusedChains;
+
+  // Re-pack into full packed vector result
+  MVT PackedVT =
+      getLegalVectorType(Packing::Dense, DataVT.getVectorElementType());
+  SDValue PackedVals = CDAG.getPack(PackedVT, PartOps[(int)PackElem::Lo],
+                                    PartOps[(int)PackElem::Hi], UpperPartAVL);
+
+  return CDAG.getMergeValues({PackedVals, FusedChains});
+}
+
+SDValue VETargetLowering::legalizeInternalLoadStoreOp(SDValue Op,
+                                                      VECustomDAG &CDAG) const {
+  LLVM_DEBUG(dbgs() << "::legalizeInternalLoadStoreOp\n";);
+  MVT DataVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+
+  // TODO: Recognize packable load,store.
+  if (isPackedVectorType(DataVT))
+    return splitPackedLoadStore(Op, CDAG);
+
+  return legalizePackedAVL(Op, CDAG);
+}
+
 SDValue VETargetLowering::legalizeInternalVectorOp(SDValue Op,
                                                    SelectionDAG &DAG) const {
+  LLVM_DEBUG(dbgs() << "::legalizeInternalVectorOp\n";);
   VECustomDAG CDAG(DAG, Op);
 
+  // Dispatch to specialized legalization functions.
+  switch (Op->getOpcode()) {
+  case VEISD::VVP_LOAD:
+  case VEISD::VVP_STORE:
+    return legalizeInternalLoadStoreOp(Op, CDAG);
+  }
+
   EVT IdiomVT = Op.getValueType();
   if (isPackedVectorType(IdiomVT) &&
       !supportsPackedMode(Op.getOpcode(), IdiomVT))
@@ -229,7 +334,8 @@
   // Half and round up EVL for 32bit element types.
SDValue LegalAVL = AVL; - if (isPackedVectorType(Op.getValueType())) { + MVT IdiomVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT(); + if (isPackedVectorType(IdiomVT)) { assert(maySafelyIgnoreMask(Op) && "TODO Shift predication from EVL into Mask"); diff --git a/llvm/test/CodeGen/VE/Packed/vec_load.ll b/llvm/test/CodeGen/VE/Packed/vec_load.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/VE/Packed/vec_load.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s + +declare <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %0, i32 immarg %1, <512 x i1> %2, <512 x float> %3) #0 + +; Function Attrs: nounwind +define fastcc <512 x float> @vec_mload_v512f32(<512 x float>* %P, <512 x i1> %M) { +; CHECK-LABEL: vec_mload_v512f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 256 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vldu %v0, 8, %s0 +; CHECK-NEXT: lea %s0, 4(, %s0) +; CHECK-NEXT: vldu %v1, 8, %s0 +; CHECK-NEXT: vshf %v0, %v1, %v0, 8 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %P, i32 16, <512 x i1> %M, <512 x float> undef) + ret <512 x float> %r +} + +; TODO: Packed select legalization +; Function Attrs: nounwind +; define fastcc <512 x float> @vec_mload_pt_v512f32(<512 x float>* %P, <512 x float> %PT, <512 x i1> %M) { +; %r = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %P, i32 16, <512 x i1> %M, <512 x float> %PT) +; ret <512 x float> %r +; } + +declare <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %0, i32 immarg %1, <512 x i1> %2, <512 x i32> %3) #0 + +; Function Attrs: nounwind +define fastcc <512 x i32> @vec_mload_v512i32(<512 x i32>* %P, <512 x i1> %M) { +; CHECK-LABEL: vec_mload_v512i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 256 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vldl.zx %v0, 8, %s0 +; CHECK-NEXT: lea %s0, 4(, %s0) +; CHECK-NEXT: vldl.zx %v1, 8, %s0 +; CHECK-NEXT: vshf %v0, %v1, %v0, 13 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %P, i32 16, <512 x i1> %M, <512 x i32> undef) + ret <512 x i32> %r +} + +; TODO: Packed select legalization +; ; Function Attrs: nounwind +; define fastcc <512 x i32> @vec_mload_pt_v512i32(<512 x i32>* %P, <512 x i32> %PT, <512 x i1> %M) { +; %r = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %P, i32 16, <512 x i1> %M, <512 x i32> %PT) +; ret <512 x i32> %r +; } + +attributes #0 = { argmemonly nounwind readonly willreturn } diff --git a/llvm/test/CodeGen/VE/Packed/vec_store.ll b/llvm/test/CodeGen/VE/Packed/vec_store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/VE/Packed/vec_store.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s + +declare void @llvm.masked.store.v512f32.p0v512f32(<512 x float>, <512 x float>*, i32 immarg, <512 x i1>) + +define fastcc void @vec_mstore_v512f32(<512 x float>* %P, <512 x float> %V, <512 x i1> %M) { +; CHECK-LABEL: vec_mstore_v512f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 256 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vstu %v0, 8, %s0 +; CHECK-NEXT: vshf %v0, %v0, %v0, 4 +; CHECK-NEXT: lea %s0, 4(, %s0) +; CHECK-NEXT: vstu %v0, 8, %s0 +; CHECK-NEXT: b.l.t (, %s10) + call void @llvm.masked.store.v512f32.p0v512f32(<512 x float> %V, <512 x float>* %P, i32 16, <512 x i1> %M) 
+ ret void +} + + +declare void @llvm.masked.store.v512i32.p0v512i32(<512 x i32>, <512 x i32>*, i32 immarg, <512 x i1>) + +define fastcc void @vec_mstore_v512i32(<512 x i32>* %P, <512 x i32> %V, <512 x i1> %M) { +; CHECK-LABEL: vec_mstore_v512i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 4(, %s0) +; CHECK-NEXT: lea %s2, 256 +; CHECK-NEXT: lvl %s2 +; CHECK-NEXT: vstl %v0, 8, %s1 +; CHECK-NEXT: vshf %v0, %v0, %v0, 0 +; CHECK-NEXT: vstl %v0, 8, %s0 +; CHECK-NEXT: b.l.t (, %s10) + call void @llvm.masked.store.v512i32.p0v512i32(<512 x i32> %V, <512 x i32>* %P, i32 16, <512 x i1> %M) + ret void +}
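
Note (not part of the patch): the address arithmetic behind getSplitPtrOffset/getSplitPtrStride can be sanity-checked in isolation. The sketch below is a plain C++ illustration, assuming only what the patch states: a packed access with element byte stride S is split into one part at the base pointer (the Hi offset) and one part at base + S, both stepping by 2 * S, which is what the paired vldu/vstu with stride 8 and the 4-byte offset in the tests above correspond to. The helper names here are hypothetical stand-ins, not backend code.

// Standalone sketch of the assumed split-address math; not VE backend code.
#include <cassert>
#include <cstdint>

// The Hi part keeps the base pointer; the Lo part starts one element further.
static uint64_t splitPtrOffset(uint64_t Ptr, uint64_t ByteStride, bool IsHi) {
  return IsHi ? Ptr : Ptr + ByteStride;
}

// Both parts step over two packed elements at a time.
static uint64_t splitPtrStride(uint64_t PackStride) { return 2 * PackStride; }

int main() {
  const uint64_t Base = 0x1000;  // arbitrary base address
  const uint64_t ElemBytes = 4;  // i32/f32 element size
  // Lane i of a <512 x i32>/<512 x float> lives at Base + i * ElemBytes.
  // The part using the Hi offset touches the even lanes, the other part the
  // odd lanes, so together they cover the original contiguous access.
  for (unsigned i = 0; i < 512; ++i) {
    bool IsHi = (i % 2) == 0;
    uint64_t PartBase = splitPtrOffset(Base, ElemBytes, IsHi);
    uint64_t PartAddr = PartBase + (i / 2) * splitPtrStride(ElemBytes);
    assert(PartAddr == Base + i * ElemBytes);
  }
  return 0;
}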