diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -73,6 +73,11 @@
   case ISD::SDNAME: \
     return VEISD::VVPNAME;
 #include "VVPNodes.def"
+  // TODO: Map those in VVPNodes.def too
+  case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
+    return VEISD::VVP_LOAD;
+  case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
+    return VEISD::VVP_STORE;
   }
   return None;
 }
@@ -275,10 +280,17 @@
 }
 
 SDValue getLoadStoreStride(SDValue Op, VECustomDAG &CDAG) {
-  if (Op->getOpcode() == VEISD::VVP_STORE)
+  switch (Op->getOpcode()) {
+  case VEISD::VVP_STORE:
     return Op->getOperand(3);
-  if (Op->getOpcode() == VEISD::VVP_LOAD)
+  case VEISD::VVP_LOAD:
     return Op->getOperand(2);
+  }
+
+  if (auto *StoreN = dyn_cast<VPStridedStoreSDNode>(Op.getNode()))
+    return StoreN->getStride();
+  if (auto *StoreN = dyn_cast<VPStridedLoadSDNode>(Op.getNode()))
+    return StoreN->getStride();
 
   if (isa<MemSDNode>(Op.getNode())) {
     // Regular MLOAD/MSTORE/LOAD/STORE
@@ -309,6 +321,7 @@
 
 SDValue getStoredValue(SDValue Op) {
   switch (Op->getOpcode()) {
+  case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
   case VEISD::VVP_STORE:
     return Op->getOperand(1);
   }
@@ -316,6 +329,8 @@
     return StoreN->getValue();
   if (auto *StoreN = dyn_cast<MaskedStoreSDNode>(Op.getNode()))
     return StoreN->getValue();
+  if (auto *StoreN = dyn_cast<VPStridedStoreSDNode>(Op.getNode()))
+    return StoreN->getValue();
   if (auto *StoreN = dyn_cast<VPStoreSDNode>(Op.getNode()))
     return StoreN->getValue();
   if (auto *StoreN = dyn_cast<MaskedScatterSDNode>(Op.getNode()))
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -315,6 +315,8 @@
     setOperationAction(ISD::VP_OPC, LegalVecVT, Custom);
 #define ADD_VVP_OP(VVP_NAME, ISD_NAME) \
     setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
+    setOperationAction(ISD::EXPERIMENTAL_VP_STRIDED_LOAD, LegalVecVT, Custom);
+    setOperationAction(ISD::EXPERIMENTAL_VP_STRIDED_STORE, LegalVecVT, Custom);
 #include "VVPNodes.def"
   }
 
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -134,6 +134,8 @@
   // Load specific.
   SDValue PassThru = getNodePassthru(Op);
 
+  SDValue StrideV = getLoadStoreStride(Op, CDAG);
+
   auto DataVT = *getIdiomaticVectorType(Op.getNode());
   auto Packing = getTypePacking(DataVT);
 
@@ -145,7 +147,6 @@
   if (!Mask)
     Mask = CDAG.getConstantMask(Packing, true);
 
-  SDValue StrideV = getLoadStoreStride(Op, CDAG);
   if (IsLoad) {
     MVT LegalDataVT = getLegalVectorType(
         Packing, DataVT.getVectorElementType().getSimpleVT());
diff --git a/llvm/test/CodeGen/VE/Vector/vp_strided_load.ll b/llvm/test/CodeGen/VE/Vector/vp_strided_load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_strided_load.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.experimental.vp.strided.load.v256f32.i64(float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+
+define fastcc <256 x float> @vp_strided_load_v256f32_rrm(float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256f32_rrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vseq %v0
+; CHECK-NEXT:    vmulu.l %v0, %s1, %v0, %vm1
+; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    vgtu %v0, %v0, 0, 0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <256 x float> @llvm.experimental.vp.strided.load.v256f32.i64(float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  ret <256 x float> %r
+}
+
+define fastcc <256 x float> @vp_strided_load_v256f32_rr(float* %ptr, i64 %stride, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256f32_rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vldu %v0, %s1, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  %r = call <256 x float> @llvm.experimental.vp.strided.load.v256f32.i64(float* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  ret <256 x float> %r
+}
+
+define fastcc <256 x float> @vp_strided_load_v256f32_ri(float* %ptr, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256f32_ri:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vldu %v0, 24, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  %r = call <256 x float> @llvm.experimental.vp.strided.load.v256f32.i64(float* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  ret <256 x float> %r
+}
+
+declare <256 x i32> @llvm.experimental.vp.strided.load.v256i32.i64(i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+
+define fastcc <256 x i32> @vp_strided_load_v256i32_rrm(i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256i32_rrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vseq %v0
+; CHECK-NEXT:    vmulu.l %v0, %s1, %v0, %vm1
+; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    vgtl.zx %v0, %v0, 0, 0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <256 x i32> @llvm.experimental.vp.strided.load.v256i32.i64(i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  ret <256 x i32> %r
+}
+
+define fastcc <256 x i32> @vp_strided_load_v256i32_rr(i32* %ptr, i64 %stride, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256i32_rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vldl.zx %v0, %s1, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  %r = call <256 x i32> @llvm.experimental.vp.strided.load.v256i32.i64(i32* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  ret <256 x i32> %r
+}
+
+define fastcc <256 x i32> @vp_strided_load_v256i32_ri(i32* %ptr, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256i32_ri:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vldl.zx %v0, 24, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  %r = call <256 x i32> @llvm.experimental.vp.strided.load.v256i32.i64(i32* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  ret <256 x i32> %r
+}
+
+declare <256 x double> @llvm.experimental.vp.strided.load.v256f64.i64(double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+
+define fastcc <256 x double> @vp_strided_load_v256f64_rrm(double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256f64_rrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vseq %v0
+; CHECK-NEXT:    vmulu.l %v0, %s1, %v0, %vm1
+; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    vgt %v0, %v0, 0, 0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <256 x double> @llvm.experimental.vp.strided.load.v256f64.i64(double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  ret <256 x double> %r
+}
+
+define fastcc <256 x double> @vp_strided_load_v256f64_rr(double* %ptr, i64 %stride, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256f64_rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vld %v0, %s1, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  %r = call <256 x double> @llvm.experimental.vp.strided.load.v256f64.i64(double* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  ret <256 x double> %r
+}
+
+define fastcc <256 x double> @vp_strided_load_v256f64_ri(double* %ptr, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256f64_ri:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vld %v0, 24, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  %r = call <256 x double> @llvm.experimental.vp.strided.load.v256f64.i64(double* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  ret <256 x double> %r
+}
+
+declare <256 x i64> @llvm.experimental.vp.strided.load.v256i64.i64(i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+
+define fastcc <256 x i64> @vp_strided_load_v256i64_rrm(i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256i64_rrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vseq %v0
+; CHECK-NEXT:    vmulu.l %v0, %s1, %v0, %vm1
+; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    vgt %v0, %v0, 0, 0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <256 x i64> @llvm.experimental.vp.strided.load.v256i64.i64(i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  ret <256 x i64> %r
+}
+
+define fastcc <256 x i64> @vp_strided_load_v256i64_rr(i64* %ptr, i64 %stride, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256i64_rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vld %v0, %s1, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  %r = call <256 x i64> @llvm.experimental.vp.strided.load.v256i64.i64(i64* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  ret <256 x i64> %r
+}
+
+define fastcc <256 x i64> @vp_strided_load_v256i64_ri(i64* %ptr, i32 %evl) {
+; CHECK-LABEL: vp_strided_load_v256i64_ri:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vld %v0, 24, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  %r = call <256 x i64> @llvm.experimental.vp.strided.load.v256i64.i64(i64* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  ret <256 x i64> %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_strided_store.ll b/llvm/test/CodeGen/VE/Vector/vp_strided_store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_strided_store.ll
@@ -0,0 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+declare void @llvm.experimental.vp.strided.store.v256f32.i64(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+
+define fastcc void @vp_strided_store_v256f32_rrm(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256f32_rrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vstu %v0, %s1, %s0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  call void @llvm.experimental.vp.strided.store.v256f32.i64(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  ret void
+}
+
+define fastcc void @vp_strided_store_v256f32_rr(<256 x float> %val, float* %ptr, i64 %stride, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256f32_rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vstu %v0, %s1, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  call void @llvm.experimental.vp.strided.store.v256f32.i64(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  ret void
+}
+
+define fastcc void @vp_strided_store_v256f32_ri(<256 x float> %val, float* %ptr, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256f32_ri:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vstu %v0, 24, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  call void @llvm.experimental.vp.strided.store.v256f32.i64(<256 x float> %val, float* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  ret void
+}
+
+declare void @llvm.experimental.vp.strided.store.v256i32.i64(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+
+define fastcc void @vp_strided_store_v256i32_rrm(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256i32_rrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vstl %v0, %s1, %s0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  call void @llvm.experimental.vp.strided.store.v256i32.i64(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  ret void
+}
+
+define fastcc void @vp_strided_store_v256i32_rr(<256 x i32> %val, i32* %ptr, i64 %stride, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256i32_rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vstl %v0, %s1, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  call void @llvm.experimental.vp.strided.store.v256i32.i64(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  ret void
+}
+
+define fastcc void @vp_strided_store_v256i32_ri(<256 x i32> %val, i32* %ptr, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256i32_ri:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vstl %v0, 24, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  call void @llvm.experimental.vp.strided.store.v256i32.i64(<256 x i32> %val, i32* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  ret void
+}
+
+declare void @llvm.experimental.vp.strided.store.v256f64.i64(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+
+define fastcc void @vp_strided_store_v256f64_rrm(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256f64_rrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vst %v0, %s1, %s0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  call void @llvm.experimental.vp.strided.store.v256f64.i64(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  ret void
+}
+
+define fastcc void @vp_strided_store_v256f64_rr(<256 x double> %val, double* %ptr, i64 %stride, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256f64_rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  call void @llvm.experimental.vp.strided.store.v256f64.i64(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  ret void
+}
+
+define fastcc void @vp_strided_store_v256f64_ri(<256 x double> %val, double* %ptr, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256f64_ri:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vst %v0, 24, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  call void @llvm.experimental.vp.strided.store.v256f64.i64(<256 x double> %val, double* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  ret void
+}
+
+declare void @llvm.experimental.vp.strided.store.v256i64.i64(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+
+define fastcc void @vp_strided_store_v256i64_rrm(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256i64_rrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vst %v0, %s1, %s0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  call void @llvm.experimental.vp.strided.store.v256i64.i64(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  ret void
+}
+
+define fastcc void @vp_strided_store_v256i64_rr(<256 x i64> %val, i64* %ptr, i64 %stride, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256i64_rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  call void @llvm.experimental.vp.strided.store.v256i64.i64(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  ret void
+}
+
+define fastcc void @vp_strided_store_v256i64_ri(<256 x i64> %val, i64* %ptr, i32 %evl) {
+; CHECK-LABEL: vp_strided_store_v256i64_ri:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vst %v0, 24, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %one = insertelement <256 x i1> undef, i1 1, i32 0
+  %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
+  call void @llvm.experimental.vp.strided.store.v256i64.i64(<256 x i64> %val, i64* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  ret void
+}