diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -343,6 +343,13 @@
     for (unsigned IntRedOpc : IntReductionOCs)
       setOperationAction(IntRedOpc, VT, Custom);
   }
+
+  // v256i1 and v512i1 ops
+  for (MVT MaskVT : AllMaskVTs) {
+    // Custom lower mask ops
+    setOperationAction(ISD::STORE, MaskVT, Custom);
+    setOperationAction(ISD::LOAD, MaskVT, Custom);
+  }
 }
 
 SDValue
@@ -1339,6 +1346,72 @@
   return DAG.getMergeValues(Ops, DL);
 }
 
+// Lower a vXi1 load into the following instructions
+//   LDrii %1, (,%addr)
+//   LVMxir %vm, 0, %1
+//   LDrii %2, 8(,%addr)
+//   LVMxir %vm, 1, %2
+//   ...
+static SDValue lowerLoadI1(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
+  assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
+
+  SDValue BasePtr = LdNode->getBasePtr();
+  unsigned Alignment = LdNode->getAlign().value();
+  if (Alignment > 8)
+    Alignment = 8;
+
+  EVT AddrVT = BasePtr.getValueType();
+  EVT MemVT = LdNode->getMemoryVT();
+  if (MemVT == MVT::v256i1 || MemVT == MVT::v4i64) {
+    SDValue OutChains[4];
+    SDNode *VM = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MemVT);
+    for (int i = 0; i < 4; ++i) {
+      // Generate load dag and prepare chains.
+      SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
+                                 DAG.getConstant(8 * i, DL, AddrVT));
+      SDValue Val =
+          DAG.getLoad(MVT::i64, DL, LdNode->getChain(), Addr,
+                      LdNode->getPointerInfo(), Alignment,
+                      LdNode->isVolatile() ? MachineMemOperand::MOVolatile
+                                           : MachineMemOperand::MONone);
+      OutChains[i] = SDValue(Val.getNode(), 1);
+
+      VM = DAG.getMachineNode(VE::LVMir_m, DL, MVT::i64,
+                              DAG.getTargetConstant(i, DL, MVT::i64), Val,
+                              SDValue(VM, 0));
+    }
+    SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+    SDValue Ops[2] = {SDValue(VM, 0), OutChain};
+    return DAG.getMergeValues(Ops, DL);
+  } else if (MemVT == MVT::v512i1 || MemVT == MVT::v8i64) {
+    SDValue OutChains[8];
+    SDNode *VM = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MemVT);
+    for (int i = 0; i < 8; ++i) {
+      // Generate load dag and prepare chains.
+      SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
+                                 DAG.getConstant(8 * i, DL, AddrVT));
+      SDValue Val =
+          DAG.getLoad(MVT::i64, DL, LdNode->getChain(), Addr,
+                      LdNode->getPointerInfo(), Alignment,
+                      LdNode->isVolatile() ? MachineMemOperand::MOVolatile
+                                           : MachineMemOperand::MONone);
+      OutChains[i] = SDValue(Val.getNode(), 1);
+
+      VM = DAG.getMachineNode(VE::LVMyir_y, DL, MVT::i64,
+                              DAG.getTargetConstant(i, DL, MVT::i64), Val,
+                              SDValue(VM, 0));
+    }
+    SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+    SDValue Ops[2] = {SDValue(VM, 0), OutChain};
+    return DAG.getMergeValues(Ops, DL);
+  } else {
+    // Otherwise, ask llvm to expand it.
+    return SDValue();
+  }
+}
+
 SDValue
 VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
@@ -1357,6 +1430,8 @@
   if (MemVT == MVT::f128)
     return lowerLoadF128(Op, DAG);
+  if (isMaskType(MemVT))
+    return lowerLoadI1(Op, DAG);
 
   return Op;
 }
@@ -1397,11 +1472,64 @@
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
 }
 
+// Lower a vXi1 store into the following instructions
+//   SVMi  %1, %vm, 0
+//   STrii %1, (,%addr)
+//   SVMi  %2, %vm, 1
+//   STrii %2, 8(,%addr)
+//   ...
+static SDValue lowerStoreI1(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
+  assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
+
+  SDValue BasePtr = StNode->getBasePtr();
+  unsigned Alignment = StNode->getAlign().value();
+  if (Alignment > 8)
+    Alignment = 8;
+  EVT AddrVT = BasePtr.getValueType();
+  EVT MemVT = StNode->getMemoryVT();
+  if (MemVT == MVT::v256i1 || MemVT == MVT::v4i64) {
+    SDValue OutChains[4];
+    for (int i = 0; i < 4; ++i) {
+      SDNode *V =
+          DAG.getMachineNode(VE::SVMmi, DL, MVT::i64, StNode->getValue(),
+                             DAG.getTargetConstant(i, DL, MVT::i64));
+      SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
+                                 DAG.getConstant(8 * i, DL, AddrVT));
+      OutChains[i] =
+          DAG.getStore(StNode->getChain(), DL, SDValue(V, 0), Addr,
+                       MachinePointerInfo(), Alignment,
+                       StNode->isVolatile() ? MachineMemOperand::MOVolatile
+                                            : MachineMemOperand::MONone);
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+  } else if (MemVT == MVT::v512i1 || MemVT == MVT::v8i64) {
+    SDValue OutChains[8];
+    for (int i = 0; i < 8; ++i) {
+      SDNode *V =
+          DAG.getMachineNode(VE::SVMyi, DL, MVT::i64, StNode->getValue(),
+                             DAG.getTargetConstant(i, DL, MVT::i64));
+      SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
+                                 DAG.getConstant(8 * i, DL, AddrVT));
+      OutChains[i] =
+          DAG.getStore(StNode->getChain(), DL, SDValue(V, 0), Addr,
+                       MachinePointerInfo(), Alignment,
+                       StNode->isVolatile() ? MachineMemOperand::MOVolatile
+                                            : MachineMemOperand::MONone);
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+  } else {
+    // Otherwise, ask llvm to expand it.
+    return SDValue();
+  }
+}
+
 SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
 
-  // always expand non-mask vector loads to VVP
+  // always expand non-mask vector loads to VVP
   EVT MemVT = StNode->getMemoryVT();
   if (MemVT.isVector() && !isMaskType(MemVT))
     return lowerToVVP(Op, DAG);
@@ -1415,6 +1543,8 @@
   if (MemVT == MVT::f128)
     return lowerStoreF128(Op, DAG);
+  if (isMaskType(MemVT))
+    return lowerStoreI1(Op, DAG);
 
   // Otherwise, ask llvm to expand it.
   return SDValue();
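For orientation before the tests: with the two Custom hooks above in place, mask-typed loads and stores such as the hand-written sketch below are split into 64-bit chunks instead of being expanded. This snippet is illustrative only and is not part of the patch (the names @copy_v256i1, %src, and %dst are invented); the autogenerated tests that follow cover the real cases.

; A <256 x i1> load becomes four ld + lvm pairs (lowerLoadI1) and the store
; becomes four svm + st pairs (lowerStoreI1); <512 x i1> uses eight chunks
; and a mask-register pair.
define fastcc <256 x i1> @copy_v256i1(<256 x i1>* %src, <256 x i1>* %dst) {
  %m = load <256 x i1>, <256 x i1>* %src, align 16
  store <256 x i1> %m, <256 x i1>* %dst, align 16
  ret <256 x i1> %m
}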
diff --git a/llvm/test/CodeGen/VE/Vector/loadvm.ll b/llvm/test/CodeGen/VE/Vector/loadvm.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/loadvm.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+@v256i1 = common dso_local local_unnamed_addr global <256 x i1> zeroinitializer, align 4
+@v512i1 = common dso_local local_unnamed_addr global <512 x i1> zeroinitializer, align 4
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc <256 x i1> @loadv256i1(<256 x i1>* nocapture readonly %mp) {
+; CHECK-LABEL: loadv256i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld %s1, (, %s0)
+; CHECK-NEXT: ld %s2, 8(, %s0)
+; CHECK-NEXT: ld %s3, 16(, %s0)
+; CHECK-NEXT: ld %s0, 24(, %s0)
+; CHECK-NEXT: lvm %vm1, 0, %s1
+; CHECK-NEXT: lvm %vm1, 1, %s2
+; CHECK-NEXT: lvm %vm1, 2, %s3
+; CHECK-NEXT: lvm %vm1, 3, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  %m = load <256 x i1>, <256 x i1>* %mp, align 16
+  ret <256 x i1> %m
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc <256 x i1> @loadv256i1com() {
+; CHECK-LABEL: loadv256i1com:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, v256i1@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s0, v256i1@hi(, %s0)
+; CHECK-NEXT: ld %s1, (, %s0)
+; CHECK-NEXT: ld %s2, 8(, %s0)
+; CHECK-NEXT: ld %s3, 16(, %s0)
+; CHECK-NEXT: ld %s0, 24(, %s0)
+; CHECK-NEXT: lvm %vm1, 0, %s1
+; CHECK-NEXT: lvm %vm1, 1, %s2
+; CHECK-NEXT: lvm %vm1, 2, %s3
+; CHECK-NEXT: lvm %vm1, 3, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  %m = load <256 x i1>, <256 x i1>* @v256i1, align 16
+  ret <256 x i1> %m
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc <512 x i1> @loadv512i1(<512 x i1>* nocapture readonly %mp) {
+; CHECK-LABEL: loadv512i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld %s1, (, %s0)
+; CHECK-NEXT: ld %s2, 8(, %s0)
+; CHECK-NEXT: ld %s3, 16(, %s0)
+; CHECK-NEXT: ld %s4, 24(, %s0)
+; CHECK-NEXT: lvm %vm3, 0, %s1
+; CHECK-NEXT: lvm %vm3, 1, %s2
+; CHECK-NEXT: lvm %vm3, 2, %s3
+; CHECK-NEXT: lvm %vm3, 3, %s4
+; CHECK-NEXT: ld %s1, 32(, %s0)
+; CHECK-NEXT: ld %s2, 40(, %s0)
+; CHECK-NEXT: ld %s3, 48(, %s0)
+; CHECK-NEXT: ld %s0, 56(, %s0)
+; CHECK-NEXT: lvm %vm2, 0, %s1
+; CHECK-NEXT: lvm %vm2, 1, %s2
+; CHECK-NEXT: lvm %vm2, 2, %s3
+; CHECK-NEXT: lvm %vm2, 3, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  %m = load <512 x i1>, <512 x i1>* %mp, align 16
+  ret <512 x i1> %m
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc <512 x i1> @loadv512i1com() {
+; CHECK-LABEL: loadv512i1com:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, v512i1@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s0, v512i1@hi(, %s0)
+; CHECK-NEXT: ld %s1, (, %s0)
+; CHECK-NEXT: ld %s2, 8(, %s0)
+; CHECK-NEXT: ld %s3, 16(, %s0)
+; CHECK-NEXT: ld %s4, 24(, %s0)
+; CHECK-NEXT: lvm %vm3, 0, %s1
+; CHECK-NEXT: lvm %vm3, 1, %s2
+; CHECK-NEXT: lvm %vm3, 2, %s3
+; CHECK-NEXT: lvm %vm3, 3, %s4
+; CHECK-NEXT: ld %s1, 32(, %s0)
+; CHECK-NEXT: ld %s2, 40(, %s0)
+; CHECK-NEXT: ld %s3, 48(, %s0)
+; CHECK-NEXT: ld %s0, 56(, %s0)
+; CHECK-NEXT: lvm %vm2, 0, %s1
+; CHECK-NEXT: lvm %vm2, 1, %s2
+; CHECK-NEXT: lvm %vm2, 2, %s3
+; CHECK-NEXT: lvm %vm2, 3, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  %m = load <512 x i1>, <512 x i1>* @v512i1, align 16
+  ret <512 x i1> %m
+}
+
diff --git a/llvm/test/CodeGen/VE/Vector/loadvr.ll b/llvm/test/CodeGen/VE/Vector/loadvr.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/loadvr.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+@v256i64 = common dso_local local_unnamed_addr global <256 x i64> zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc <256 x i64> @loadv256i64(<256 x i64>* nocapture readonly) {
+; CHECK-LABEL: loadv256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vld %v0, 8, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  %2 = load <256 x i64>, <256 x i64>* %0, align 16
+  ret <256 x i64> %2
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc <256 x double> @loadv256f64(<256 x double>* nocapture readonly) {
+; CHECK-LABEL: loadv256f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vld %v0, 8, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  %2 = load <256 x double>, <256 x double>* %0, align 16
+  ret <256 x double> %2
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc <256 x i32> @loadv256i32(<256 x i32>* nocapture readonly) {
+; CHECK-LABEL: loadv256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vldl.zx %v0, 4, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  %2 = load <256 x i32>, <256 x i32>* %0, align 16
+  ret <256 x i32> %2
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc <256 x float> @loadv256f32(<256 x float>* nocapture readonly) {
+; CHECK-LABEL: loadv256f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vldu %v0, 4, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  %2 = load <256 x float>, <256 x float>* %0, align 16
+  ret <256 x float> %2
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc <256 x i64> @loadv256i64stk() {
+; CHECK-LABEL: loadv256i64stk:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s11, -2048(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB4_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: lea %s1, (, %s11)
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vld %v0, 8, %s1
+; CHECK-NEXT: lea %s11, 2048(, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+  %addr = alloca <256 x i64>, align 16
+  %1 = load <256 x i64>, <256 x i64>* %addr, align 16
+  ret <256 x i64> %1
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc <256 x i64> @loadv256i64com() {
+; CHECK-LABEL: loadv256i64com:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, v256i64@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s0, v256i64@hi(, %s0)
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vld %v0, 8, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  %1 = load <256 x i64>, <256 x i64>* @v256i64, align 16
+  ret <256 x i64> %1
+}
diff --git a/llvm/test/CodeGen/VE/Vector/storevm.ll b/llvm/test/CodeGen/VE/Vector/storevm.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/storevm.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+@v256i1 = common dso_local local_unnamed_addr global <256 x i1> zeroinitializer, align 4
+@v512i1 = common dso_local local_unnamed_addr global <512 x i1> zeroinitializer, align 4
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc void @storev256i1(<256 x i1>* nocapture %mp, <256 x i1> %m) {
+; CHECK-LABEL: storev256i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: svm %s1, %vm1, 3
+; CHECK-NEXT: st %s1, 24(, %s0)
+; CHECK-NEXT: svm %s1, %vm1, 2
+; CHECK-NEXT: st %s1, 16(, %s0)
+; CHECK-NEXT: svm %s1, %vm1, 1
+; CHECK-NEXT: st %s1, 8(, %s0)
+; CHECK-NEXT: svm %s1, %vm1, 0
+; CHECK-NEXT: st %s1, (, %s0)
+; CHECK-NEXT: b.l.t (, %s10)
+  store <256 x i1> %m, <256 x i1>* %mp, align 16
+  ret void
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc void @storev256i1com(<256 x i1> %m) {
+; CHECK-LABEL: storev256i1com:
+; CHECK: # %bb.0:
+; CHECK-NEXT: svm %s0, %vm1, 3
+; CHECK-NEXT: lea %s1, v256i1@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s1, v256i1@hi(, %s1)
+; CHECK-NEXT: st %s0, 24(, %s1)
+; CHECK-NEXT: svm %s0, %vm1, 2
+; CHECK-NEXT: st %s0, 16(, %s1)
+; CHECK-NEXT: svm %s0, %vm1, 1
+; CHECK-NEXT: st %s0, 8(, %s1)
+; CHECK-NEXT: svm %s0, %vm1, 0
+; CHECK-NEXT: st %s0, (, %s1)
+; CHECK-NEXT: b.l.t (, %s10)
+  store <256 x i1> %m, <256 x i1>* @v256i1, align 16
+  ret void
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc void @storev512i1(<512 x i1>* nocapture %mp, <512 x i1> %m) {
+; CHECK-LABEL: storev512i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: svm %s1, %vm2, 3
+; CHECK-NEXT: st %s1, 56(, %s0)
+; CHECK-NEXT: svm %s1, %vm2, 2
+; CHECK-NEXT: st %s1, 48(, %s0)
+; CHECK-NEXT: svm %s1, %vm2, 1
+; CHECK-NEXT: st %s1, 40(, %s0)
+; CHECK-NEXT: svm %s1, %vm2, 0
+; CHECK-NEXT: st %s1, 32(, %s0)
+; CHECK-NEXT: svm %s1, %vm3, 3
+; CHECK-NEXT: st %s1, 24(, %s0)
+; CHECK-NEXT: svm %s1, %vm3, 2
+; CHECK-NEXT: st %s1, 16(, %s0)
+; CHECK-NEXT: svm %s1, %vm3, 1
+; CHECK-NEXT: st %s1, 8(, %s0)
+; CHECK-NEXT: svm %s1, %vm3, 0
+; CHECK-NEXT: st %s1, (, %s0)
+; CHECK-NEXT: b.l.t (, %s10)
+  store <512 x i1> %m, <512 x i1>* %mp, align 16
+  ret void
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc void @storev512i1com(<512 x i1> %m) {
+; CHECK-LABEL: storev512i1com:
+; CHECK: # %bb.0:
+; CHECK-NEXT: svm %s0, %vm2, 3
+; CHECK-NEXT: lea %s1, v512i1@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s1, v512i1@hi(, %s1)
+; CHECK-NEXT: st %s0, 56(, %s1)
+; CHECK-NEXT: svm %s0, %vm2, 2
+; CHECK-NEXT: st %s0, 48(, %s1)
+; CHECK-NEXT: svm %s0, %vm2, 1
+; CHECK-NEXT: st %s0, 40(, %s1)
+; CHECK-NEXT: svm %s0, %vm2, 0
+; CHECK-NEXT: st %s0, 32(, %s1)
+; CHECK-NEXT: svm %s0, %vm3, 3
+; CHECK-NEXT: st %s0, 24(, %s1)
+; CHECK-NEXT: svm %s0, %vm3, 2
+; CHECK-NEXT: st %s0, 16(, %s1)
+; CHECK-NEXT: svm %s0, %vm3, 1
+; CHECK-NEXT: st %s0, 8(, %s1)
+; CHECK-NEXT: svm %s0, %vm3, 0
+; CHECK-NEXT: st %s0, (, %s1)
+; CHECK-NEXT: b.l.t (, %s10)
+  store <512 x i1> %m, <512 x i1>* @v512i1, align 16
+  ret void
+}
diff --git a/llvm/test/CodeGen/VE/Vector/storevr.ll b/llvm/test/CodeGen/VE/Vector/storevr.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/storevr.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+@v256i64 = common dso_local local_unnamed_addr global <256 x i64> zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc void @storev256i64(<256 x i64>* nocapture, <256 x i64>) {
+; CHECK-LABEL: storev256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vst %v0, 8, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  store <256 x i64> %1, <256 x i64>* %0, align 16
+  ret void
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc void @storev256i64stk(<256 x i64>) {
+; CHECK-LABEL: storev256i64stk:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s11, -2048(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB1_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: lea %s1, (, %s11)
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vst %v0, 8, %s1
+; CHECK-NEXT: lea %s11, 2048(, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+  %addr = alloca <256 x i64>, align 16
+  store <256 x i64> %0, <256 x i64>* %addr, align 16
+  ret void
+}
+
+; Function Attrs: norecurse nounwind readonly
+define fastcc void @storev256i64com(<256 x i64>) {
+; CHECK-LABEL: storev256i64com:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, v256i64@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s0, v256i64@hi(, %s0)
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vst %v0, 8, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+  store <256 x i64> %0, <256 x i64>* @v256i64, align 16
+  ret void
+}
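The CHECK lines in all four new tests were generated by the script named in their NOTE headers, so they can be regenerated rather than edited by hand if the lowering changes. A typical invocation, assuming a built llc and the LLVM source root as the working directory (the llc path shown is illustrative), is:

  llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
      llvm/test/CodeGen/VE/Vector/loadvm.ll llvm/test/CodeGen/VE/Vector/storevm.ll \
      llvm/test/CodeGen/VE/Vector/loadvr.ll llvm/test/CodeGen/VE/Vector/storevr.ll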