diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h --- a/llvm/lib/Target/VE/VECustomDAG.h +++ b/llvm/lib/Target/VE/VECustomDAG.h @@ -102,6 +102,10 @@ SDValue getNodePassthru(SDValue Op); +SDValue getGatherScatterIndex(SDValue Op); + +SDValue getGatherScatterScale(SDValue Op); + /// } Node Properties enum class Packing { @@ -193,6 +197,11 @@ SDValue getSplitPtrOffset(SDValue Ptr, SDValue ByteStride, PackElem Part) const; SDValue getSplitPtrStride(SDValue PackStride) const; + SDValue getGatherScatterAddress(SDValue BasePtr, SDValue Scale, SDValue Index, + SDValue Mask, SDValue AVL) const; + EVT getVectorVT(EVT ElemVT, unsigned NumElems) const { + return EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems); + } }; } // namespace llvm diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp --- a/llvm/lib/Target/VE/VECustomDAG.cpp +++ b/llvm/lib/Target/VE/VECustomDAG.cpp @@ -277,6 +277,26 @@ return SDValue(); } +SDValue getGatherScatterIndex(SDValue Op) { + if (auto *N = dyn_cast(Op.getNode())) { + return N->getIndex(); + } + if (auto *N = dyn_cast(Op.getNode())) { + return N->getIndex(); + } + return SDValue(); +} + +SDValue getGatherScatterScale(SDValue Op) { + if (auto *N = dyn_cast(Op.getNode())) { + return N->getScale(); + } + if (auto *N = dyn_cast(Op.getNode())) { + return N->getScale(); + } + return SDValue(); +} + SDValue getStoredValue(SDValue Op) { switch (Op->getOpcode()) { case VEISD::VVP_STORE: @@ -288,12 +308,19 @@ return StoreN->getValue(); if (auto *StoreN = dyn_cast(Op.getNode())) return StoreN->getValue(); + if (auto *StoreN = dyn_cast(Op.getNode())) + return StoreN->getValue(); + if (auto *StoreN = dyn_cast(Op.getNode())) + return StoreN->getValue(); return SDValue(); } SDValue getNodePassthru(SDValue Op) { if (auto *N = dyn_cast(Op.getNode())) return N->getPassThru(); + if (auto *N = dyn_cast(Op.getNode())) + return N->getPassThru(); + return SDValue(); } @@ -450,4 +477,30 @@ return getNode(ISD::SHL, MVT::i64, {PackStride, getConstant(1, MVT::i32)}); } +SDValue VECustomDAG::getGatherScatterAddress(SDValue BasePtr, SDValue Scale, + SDValue Index, SDValue Mask, + SDValue AVL) const { + EVT IndexVT = Index.getValueType(); + + // Apply scale. + SDValue ScaledIndex; + if (!Scale || isOneConstant(Scale)) + ScaledIndex = Index; + else { + SDValue ScaleBroadcast = getBroadcast(IndexVT, Scale, AVL); + ScaledIndex = + getNode(VEISD::VVP_MUL, IndexVT, {Index, ScaleBroadcast, Mask, AVL}); + } + + // Add basePtr. + if (isNullConstant(BasePtr)) + return ScaledIndex; + + // re-constitute pointer vector (basePtr + index * scale) + SDValue BaseBroadcast = getBroadcast(IndexVT, BasePtr, AVL); + auto ResPtr = + getNode(VEISD::VVP_ADD, IndexVT, {BaseBroadcast, ScaledIndex, Mask, AVL}); + return ResPtr; +} + } // namespace llvm diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h --- a/llvm/lib/Target/VE/VEISelLowering.h +++ b/llvm/lib/Target/VE/VEISelLowering.h @@ -186,7 +186,8 @@ /// VVP Lowering { SDValue lowerToVVP(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerVVP_LOAD_STORE(SDValue Op, VECustomDAG&) const; + SDValue lowerVVP_LOAD_STORE(SDValue Op, VECustomDAG &) const; + SDValue lowerVVP_GATHER_SCATTER(SDValue Op, VECustomDAG &) const; SDValue legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue legalizeInternalLoadStoreOp(SDValue Op, VECustomDAG &CDAG) const; diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp --- a/llvm/lib/Target/VE/VVPISelLowering.cpp +++ b/llvm/lib/Target/VE/VVPISelLowering.cpp @@ -51,7 +51,10 @@ case VEISD::VVP_LOAD: case VEISD::VVP_STORE: return lowerVVP_LOAD_STORE(Op, CDAG); - }; + case VEISD::VVP_GATHER: + case VEISD::VVP_SCATTER: + return lowerVVP_GATHER_SCATTER(Op, CDAG); + } EVT OpVecVT = Op.getValueType(); EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT); @@ -235,6 +238,54 @@ return CDAG.getMergeValues({PackedVals, FusedChains}); } +SDValue VETargetLowering::lowerVVP_GATHER_SCATTER(SDValue Op, + VECustomDAG &CDAG) const { + EVT DataVT = *getIdiomaticVectorType(Op.getNode()); + auto Packing = getTypePacking(DataVT); + MVT LegalDataVT = + getLegalVectorType(Packing, DataVT.getVectorElementType().getSimpleVT()); + + SDValue AVL = getAnnotatedNodeAVL(Op).first; + SDValue Index = getGatherScatterIndex(Op); + SDValue BasePtr = getMemoryPtr(Op); + SDValue Mask = getNodeMask(Op); + SDValue Chain = getNodeChain(Op); + SDValue Scale = getGatherScatterScale(Op); + SDValue PassThru = getNodePassthru(Op); + SDValue StoredValue = getStoredValue(Op); + if (PassThru && PassThru->isUndef()) + PassThru = SDValue(); + + bool IsScatter = (bool)StoredValue; + + // TODO: Infer lower AVL from mask. + if (!AVL) + AVL = CDAG.getConstant(DataVT.getVectorNumElements(), MVT::i32); + + // Default to the all-true mask. + if (!Mask) + Mask = CDAG.getConstantMask(Packing, true); + + SDValue AddressVec = + CDAG.getGatherScatterAddress(BasePtr, Scale, Index, Mask, AVL); + if (IsScatter) + return CDAG.getNode(VEISD::VVP_SCATTER, MVT::Other, + {Chain, StoredValue, AddressVec, Mask, AVL}); + + // Gather. + SDValue NewLoadV = CDAG.getNode(VEISD::VVP_GATHER, {LegalDataVT, MVT::Other}, + {Chain, AddressVec, Mask, AVL}); + + if (!PassThru) + return NewLoadV; + + // TODO: Use vvp_select + SDValue DataV = CDAG.getNode(VEISD::VVP_SELECT, LegalDataVT, + {NewLoadV, PassThru, Mask, AVL}); + SDValue NewLoadChainV = SDValue(NewLoadV.getNode(), 1); + return CDAG.getMergeValues({DataV, NewLoadChainV}); +} + SDValue VETargetLowering::legalizeInternalLoadStoreOp(SDValue Op, VECustomDAG &CDAG) const { LLVM_DEBUG(dbgs() << "::legalizeInternalLoadStoreOp\n";); diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td --- a/llvm/lib/Target/VE/VVPInstrInfo.td +++ b/llvm/lib/Target/VE/VVPInstrInfo.td @@ -36,6 +36,23 @@ IsVLVT<4> ]>; +// vvp_scatter(chain, data, addr, mask, avl) +def SDTScatterVVP: SDTypeProfile<0, 4, [ + SDTCisVec<0>, + SDTCisVec<1>, + SDTCisVec<2>, + SDTCisSameNumEltsAs<0, 2>, + IsVLVT<3> +]>; + +// vvp_gather(chain, addr, mask, avl) +def SDTGatherVVP: SDTypeProfile<1, 3, [ + SDTCisVec<0>, + SDTCisVec<1>, + SDTCisSameNumEltsAs<0, 2>, + IsVLVT<3> +]>; + // Binary Operators { // BinaryOp(x,y,mask,vl) @@ -120,6 +137,11 @@ // } Binary Operators +def vvp_scatter : SDNode<"VEISD::VVP_SCATTER", SDTScatterVVP, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def vvp_gather : SDNode<"VEISD::VVP_GATHER", SDTGatherVVP, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + def vvp_load : SDNode<"VEISD::VVP_LOAD", SDTLoadVVP, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand ]>; def vvp_store : SDNode<"VEISD::VVP_STORE", SDTStoreVVP, diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td --- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td +++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td @@ -94,6 +94,41 @@ defm : VectorLoad; defm : VectorLoad; +// Vector Gather and scatter +multiclass VectorGather { + // Unmasked. + def : Pat<(DataVT (vvp_gather + PtrVT:$addr, (MaskVT true_mask), i32:$avl)), + (!cast(GTPrefix#"vizl") $addr, 0, 0, $avl)>; + // Masked. + def : Pat<(DataVT (vvp_gather PtrVT:$addr, MaskVT:$mask, i32:$avl)), + (!cast(GTPrefix#"vizml") $addr, 0, 0, $mask, $avl)>; +} + +defm : VectorGather; +defm : VectorGather; +defm : VectorGather; +defm : VectorGather; + +multiclass VectorScatter { + // Unmasked. + def : Pat<(vvp_scatter + DataVT:$data, PtrVT:$addr, (MaskVT true_mask), i32:$avl), + (!cast(SCPrefix#"vizvl") $addr, 0, 0, $data, $avl)>; + // Masked. + def : Pat<(vvp_scatter + DataVT:$data, PtrVT:$addr, MaskVT:$mask, i32:$avl), + (!cast(SCPrefix#"vizvml") $addr, 0, 0, $data, $mask, $avl)>; +} + +defm : VectorScatter; +defm : VectorScatter; +defm : VectorScatter; +defm : VectorScatter; multiclass Binary_rv @llvm.masked.gather.v256f64.v256p0f64(<256 x double*> %0, i32 immarg %1, <256 x i1> %2, <256 x double> %3) #0 + +; Function Attrs: nounwind +define fastcc <256 x double> @vec_mgather_v256f64(<256 x double*> %P, <256 x i1> %M) { +; CHECK-LABEL: vec_mgather_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vgt %v0, %v0, 0, 0 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <256 x double> @llvm.masked.gather.v256f64.v256p0f64(<256 x double*> %P, i32 4, <256 x i1> %M, <256 x double> undef) + ret <256 x double> %r +} + +; Function Attrs: nounwind +define fastcc <256 x double> @vec_mgather_pt_v256f64(<256 x double*> %P, <256 x double> %PT, <256 x i1> %M) { +; CHECK-LABEL: vec_mgather_pt_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vgt %v0, %v0, 0, 0 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <256 x double> @llvm.masked.gather.v256f64.v256p0f64(<256 x double*> %P, i32 4, <256 x i1> %M, <256 x double> %PT) + ret <256 x double> %r +} + + +declare <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> %0, i32 immarg %1, <256 x i1> %2, <256 x float> %3) #0 + +; Function Attrs: nounwind +define fastcc <256 x float> @vec_mgather_v256f32(<256 x float*> %P, <256 x i1> %M) { +; CHECK-LABEL: vec_mgather_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vgtu %v0, %v0, 0, 0 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> %P, i32 4, <256 x i1> %M, <256 x float> undef) + ret <256 x float> %r +} + +; Function Attrs: nounwind +define fastcc <256 x float> @vec_mgather_pt_v256f32(<256 x float*> %P, <256 x float> %PT, <256 x i1> %M) { +; CHECK-LABEL: vec_mgather_pt_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vgtu %v0, %v0, 0, 0 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> %P, i32 4, <256 x i1> %M, <256 x float> %PT) + ret <256 x float> %r +} + + +declare <256 x i32> @llvm.masked.gather.v256i32.v256p0i32(<256 x i32*> %0, i32 immarg %1, <256 x i1> %2, <256 x i32> %3) #0 + +; Function Attrs: nounwind +define fastcc <256 x i32> @vec_mgather_v256i32(<256 x i32*> %P, <256 x i1> %M) { +; CHECK-LABEL: vec_mgather_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vgtl.zx %v0, %v0, 0, 0 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <256 x i32> @llvm.masked.gather.v256i32.v256p0i32(<256 x i32*> %P, i32 4, <256 x i1> %M, <256 x i32> undef) + ret <256 x i32> %r +} + +; Function Attrs: nounwind +define fastcc <256 x i32> @vec_mgather_pt_v256i32(<256 x i32*> %P, <256 x i32> %PT, <256 x i1> %M) { +; CHECK-LABEL: vec_mgather_pt_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vgtl.zx %v0, %v0, 0, 0 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <256 x i32> @llvm.masked.gather.v256i32.v256p0i32(<256 x i32*> %P, i32 4, <256 x i1> %M, <256 x i32> %PT) + ret <256 x i32> %r +} + +attributes #0 = { argmemonly nounwind readonly willreturn } diff --git a/llvm/test/CodeGen/VE/Vector/vec_scatter.ll b/llvm/test/CodeGen/VE/Vector/vec_scatter.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/VE/Vector/vec_scatter.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s + +declare void @llvm.masked.scatter.v256i64.v256p0i64(<256 x i64>, <256 x i64*>, i32 immarg, <256 x i1>) #0 + +; Function Attrs: nounwind +define fastcc void @vec_mscatter_v256i64(<256 x i64> %V, <256 x i64*> %P, <256 x i1> %M) { +; CHECK-LABEL: vec_mscatter_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vsc %v0, %v1, 0, 0 +; CHECK-NEXT: b.l.t (, %s10) + call void @llvm.masked.scatter.v256i64.v256p0i64(<256 x i64> %V, <256 x i64*> %P, i32 4, <256 x i1> %M) + ret void +} + +declare void @llvm.masked.scatter.v256f64.v256p0f64(<256 x double>, <256 x double*>, i32 immarg, <256 x i1>) #0 + +; Function Attrs: nounwind +define fastcc void @vec_mscatter_v256f64(<256 x double> %V, <256 x double*> %P, <256 x i1> %M) { +; CHECK-LABEL: vec_mscatter_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vsc %v0, %v1, 0, 0 +; CHECK-NEXT: b.l.t (, %s10) + call void @llvm.masked.scatter.v256f64.v256p0f64(<256 x double> %V, <256 x double*> %P, i32 4, <256 x i1> %M) + ret void +} + +declare void @llvm.masked.scatter.v256f32.v256p0f32(<256 x float>, <256 x float*>, i32 immarg, <256 x i1>) #0 + +; Function Attrs: nounwind +define fastcc void @vec_mscatter_v256f32(<256 x float> %V, <256 x float*> %P, <256 x i1> %M) { +; CHECK-LABEL: vec_mscatter_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vscu %v0, %v1, 0, 0 +; CHECK-NEXT: b.l.t (, %s10) + call void @llvm.masked.scatter.v256f32.v256p0f32(<256 x float> %V, <256 x float*> %P, i32 4, <256 x i1> %M) + ret void +} + +declare void @llvm.masked.scatter.v256i32.v256p0i32(<256 x i32>, <256 x i32*>, i32 immarg, <256 x i1>) #0 + +; Function Attrs: nounwind +define fastcc void @vec_mscatter_v256i32(<256 x i32> %V, <256 x i32*> %P, <256 x i1> %M) { +; CHECK-LABEL: vec_mscatter_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vscl %v0, %v1, 0, 0 +; CHECK-NEXT: b.l.t (, %s10) + call void @llvm.masked.scatter.v256i32.v256p0i32(<256 x i32> %V, <256 x i32*> %P, i32 4, <256 x i1> %M) + ret void +} + +attributes #0 = { argmemonly nounwind readonly willreturn } diff --git a/llvm/test/CodeGen/VE/Vector/vp_gather.ll b/llvm/test/CodeGen/VE/Vector/vp_gather.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/VE/Vector/vp_gather.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s + +declare <256 x i64> @llvm.vp.gather.v256i64.v256p0i64(<256 x i64*>, <256 x i1>, i32) + +; Function Attrs: nounwind +define fastcc <256 x i64> @vp_gather_v256i64(<256 x i64*> %P, <256 x i1> %M, i32 %avl) { +; CHECK-LABEL: vp_gather_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vgt %v0, %v0, 0, 0, %vm1 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <256 x i64> @llvm.vp.gather.v256i64.v256p0i64(<256 x i64*> %P, <256 x i1> %M, i32 %avl) + ret <256 x i64> %r +} + +declare <256 x double> @llvm.vp.gather.v256f64.v256p0f64(<256 x double*>, <256 x i1>, i32) + +; Function Attrs: nounwind +define fastcc <256 x double> @vp_gather_v256f64(<256 x double*> %P, <256 x i1> %M, i32 %avl) { +; CHECK-LABEL: vp_gather_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vgt %v0, %v0, 0, 0, %vm1 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <256 x double> @llvm.vp.gather.v256f64.v256p0f64(<256 x double*> %P, <256 x i1> %M, i32 %avl) + ret <256 x double> %r +} + +declare <256 x float> @llvm.vp.gather.v256f32.v256p0f32(<256 x float*>, <256 x i1>, i32) + +; Function Attrs: nounwind +define fastcc <256 x float> @vp_gather_v256f32(<256 x float*> %P, <256 x i1> %M, i32 %avl) { +; CHECK-LABEL: vp_gather_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vgtu %v0, %v0, 0, 0, %vm1 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <256 x float> @llvm.vp.gather.v256f32.v256p0f32(<256 x float*> %P, <256 x i1> %M, i32 %avl) + ret <256 x float> %r +} + +declare <256 x i32> @llvm.vp.gather.v256i32.v256p0i32(<256 x i32*>, <256 x i1>, i32) + +; Function Attrs: nounwind +define fastcc <256 x i32> @vp_gather_v256i32(<256 x i32*> %P, <256 x i1> %M, i32 %avl) { +; CHECK-LABEL: vp_gather_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vgtl.zx %v0, %v0, 0, 0, %vm1 +; CHECK-NEXT: b.l.t (, %s10) + %r = call <256 x i32> @llvm.vp.gather.v256i32.v256p0i32(<256 x i32*> %P, <256 x i1> %M, i32 %avl) + ret <256 x i32> %r +} diff --git a/llvm/test/CodeGen/VE/Vector/vp_scatter.ll b/llvm/test/CodeGen/VE/Vector/vp_scatter.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/VE/Vector/vp_scatter.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s + +declare void @llvm.vp.scatter.v256i64.v256p0i64(<256 x i64>, <256 x i64*>, <256 x i1>, i32 %avl) + +; Function Attrs: nounwind +define fastcc void @vp_mscatter_v256i64(<256 x i64> %V, <256 x i64*> %P, <256 x i1> %M, i32 %avl) { +; CHECK-LABEL: vp_mscatter_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vsc %v0, %v1, 0, 0, %vm1 +; CHECK-NEXT: b.l.t (, %s10) + call void @llvm.vp.scatter.v256i64.v256p0i64(<256 x i64> %V, <256 x i64*> %P, <256 x i1> %M, i32 %avl) + ret void +} + +declare void @llvm.vp.scatter.v256f64.v256p0f64(<256 x double>, <256 x double*>, <256 x i1>, i32 %avl) + +; Function Attrs: nounwind +define fastcc void @vp_mscatter_v256f64(<256 x double> %V, <256 x double*> %P, <256 x i1> %M, i32 %avl) { +; CHECK-LABEL: vp_mscatter_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vsc %v0, %v1, 0, 0, %vm1 +; CHECK-NEXT: b.l.t (, %s10) + call void @llvm.vp.scatter.v256f64.v256p0f64(<256 x double> %V, <256 x double*> %P, <256 x i1> %M, i32 %avl) + ret void +} + +declare void @llvm.vp.scatter.v256f32.v256p0f32(<256 x float>, <256 x float*>, <256 x i1>, i32 %avl) + +; Function Attrs: nounwind +define fastcc void @vp_mscatter_v256f32(<256 x float> %V, <256 x float*> %P, <256 x i1> %M, i32 %avl) { +; CHECK-LABEL: vp_mscatter_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vscu %v0, %v1, 0, 0, %vm1 +; CHECK-NEXT: b.l.t (, %s10) + call void @llvm.vp.scatter.v256f32.v256p0f32(<256 x float> %V, <256 x float*> %P, <256 x i1> %M, i32 %avl) + ret void +} + +declare void @llvm.vp.scatter.v256i32.v256p0i32(<256 x i32>, <256 x i32*>, <256 x i1>, i32 %avl) + +; Function Attrs: nounwind +define fastcc void @vp_mscatter_v256i32(<256 x i32> %V, <256 x i32*> %P, <256 x i1> %M, i32 %avl) { +; CHECK-LABEL: vp_mscatter_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vscl %v0, %v1, 0, 0, %vm1 +; CHECK-NEXT: b.l.t (, %s10) + call void @llvm.vp.scatter.v256i32.v256p0i32(<256 x i32> %V, <256 x i32*> %P, <256 x i1> %M, i32 %avl) + ret void +} +