Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -567,6 +567,7 @@ setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::LOAD); // It is legal to extload from v4i8 to v4i16 or v4i32. MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8, @@ -8868,15 +8869,18 @@ DAG.getUNDEF(VT), NewMask.data()); } -/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and -/// NEON load/store intrinsics to merge base address updates. +/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, +/// NEON load/store intrinsics, and generic vector load/stores, to merge +/// base address updates. +/// For generic load/stores, the memory type is assumed to be a vector. /// The caller is assumed to have checked legality. static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || N->getOpcode() == ISD::INTRINSIC_W_CHAIN); - unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); + bool isStore = N->getOpcode() == ISD::STORE; + unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); SDValue Addr = N->getOperand(AddrOpIdx); // Search for a use of the address operand that is an increment. @@ -8937,6 +8941,10 @@ case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; + case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; isLaneOp = false; break; + case ISD::STORE: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLoad = false; isLaneOp = false; break; } } @@ -8944,8 +8952,11 @@ EVT VecTy; if (isLoad) VecTy = N->getValueType(0); - else + else if (isIntrinsic) VecTy = N->getOperand(AddrOpIdx+1).getValueType(); + else + VecTy = N->getOperand(1).getValueType(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (isLaneOp) NumBytes /= VecTy.getVectorNumElements(); @@ -8978,8 +8989,13 @@ Ops.push_back(N->getOperand(0)); // incoming chain Ops.push_back(N->getOperand(AddrOpIdx)); Ops.push_back(Inc); - for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { - Ops.push_back(N->getOperand(i)); + if (StoreSDNode *StN = dyn_cast(N)) { + // Try to match the intrinsic's signature + Ops.push_back(StN->getValue()); + Ops.push_back(DAG.getConstant(StN->getAlignment(), MVT::i32)); + } else { + for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) + Ops.push_back(N->getOperand(i)); } MemSDNode *MemInt = cast(N); SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, @@ -9121,6 +9137,17 @@ return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } +static SDValue PerformLOADCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + // If this is a legal vector load, try to combine it into a VLD1_UPD. + if (VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return CombineBaseUpdate(N, DCI); + + return SDValue(); +} + /// PerformSTORECombine - Target-specific dag combine xforms for /// ISD::STORE. static SDValue PerformSTORECombine(SDNode *N, @@ -9261,6 +9288,10 @@ St->getAAInfo()); } + // If this is a legal vector store, try to combine it into a VST1_UPD. + if (VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return CombineBaseUpdate(N, DCI); + return SDValue(); } @@ -9852,6 +9883,7 @@ case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); + case ISD::LOAD: return PerformLOADCombine(N, DCI); case ARMISD::VLD2DUP: case ARMISD::VLD3DUP: case ARMISD::VLD4DUP: Index: test/CodeGen/ARM/alloc-no-stack-realign.ll =================================================================== --- test/CodeGen/ARM/alloc-no-stack-realign.ll +++ test/CodeGen/ARM/alloc-no-stack-realign.ll @@ -9,8 +9,8 @@ define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" { entry: ; NO-REALIGN-LABEL: test1 -; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1:[0-9]+]]:128] -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16 +; NO-REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]] +; NO-REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! ; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32 ; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] @@ -21,16 +21,14 @@ ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16 -; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]! ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0:0]], #48 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0]], #32 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16 -; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]! ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128] %retval = alloca <16 x float>, align 16 %0 = load <16 x float>* @T3_retval, align 16 @@ -44,8 +42,8 @@ entry: ; REALIGN-LABEL: test2 ; REALIGN: bic sp, sp, #63 -; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1:[0-9]+]]:128] -; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16 +; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]] +; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! ; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] ; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32 ; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] @@ -65,8 +63,7 @@ ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] ; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #32 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #16 -; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]! ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128] %retval = alloca <16 x float>, align 16 %0 = load <16 x float>* @T3_retval, align 16 Index: test/CodeGen/ARM/memcpy-inline.ll =================================================================== --- test/CodeGen/ARM/memcpy-inline.ll +++ test/CodeGen/ARM/memcpy-inline.ll @@ -46,10 +46,8 @@ ; CHECK: movw [[REG2:r[0-9]+]], #16716 ; CHECK: movt [[REG2:r[0-9]+]], #72 ; CHECK: str [[REG2]], [r0, #32] -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] -; CHECK: adds r0, #16 -; CHECK: adds r1, #16 +; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! +; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false) @@ -59,10 +57,8 @@ define void @t3(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t3: -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] -; CHECK: adds r0, #16 -; CHECK: adds r1, #16 +; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! +; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! ; CHECK: vld1.8 {d{{[0-9]+}}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}}, [r0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false) @@ -73,7 +69,8 @@ entry: ; CHECK-LABEL: t4: ; CHECK: vld1.8 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1] -; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0] +; CHECK: vst1.64 {[[REG3]], [[REG4]]}, [r0]! +; CHECK: strh [[REG5:r[0-9]+]], [r0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false) ret void } Index: test/CodeGen/ARM/vector-load.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/vector-load.ll @@ -0,0 +1,184 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:o-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32" +target triple = "thumbv7s-apple-ios8.0.0" + +define <8 x i8> @load_v8i8(<8 x i8>** %ptr) { +;CHECK-LABEL: load_v8i8: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <8 x i8>** %ptr + %lA = load <8 x i8>* %A, align 1 + ret <8 x i8> %lA +} + +define <8 x i8> @load_v8i8_update(<8 x i8>** %ptr) { +;CHECK-LABEL: load_v8i8_update: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <8 x i8>** %ptr + %lA = load <8 x i8>* %A, align 1 + %inc = getelementptr <8 x i8>* %A, i38 1 + store <8 x i8>* %inc, <8 x i8>** %ptr + ret <8 x i8> %lA +} + +define <4 x i16> @load_v4i16(<4 x i16>** %ptr) { +;CHECK-LABEL: load_v4i16: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <4 x i16>** %ptr + %lA = load <4 x i16>* %A, align 1 + ret <4 x i16> %lA +} + +define <4 x i16> @load_v4i16_update(<4 x i16>** %ptr) { +;CHECK-LABEL: load_v4i16_update: +;CHECK: vld1.16 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <4 x i16>** %ptr + %lA = load <4 x i16>* %A, align 1 + %inc = getelementptr <4 x i16>* %A, i34 1 + store <4 x i16>* %inc, <4 x i16>** %ptr + ret <4 x i16> %lA +} + +define <2 x i32> @load_v2i32(<2 x i32>** %ptr) { +;CHECK-LABEL: load_v2i32: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <2 x i32>** %ptr + %lA = load <2 x i32>* %A, align 1 + ret <2 x i32> %lA +} + +define <2 x i32> @load_v2i32_update(<2 x i32>** %ptr) { +;CHECK-LABEL: load_v2i32_update: +;CHECK: vld1.32 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i32>** %ptr + %lA = load <2 x i32>* %A, align 1 + %inc = getelementptr <2 x i32>* %A, i32 1 + store <2 x i32>* %inc, <2 x i32>** %ptr + ret <2 x i32> %lA +} + +define <2 x float> @load_v2f32(<2 x float>** %ptr) { +;CHECK-LABEL: load_v2f32: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <2 x float>** %ptr + %lA = load <2 x float>* %A, align 1 + ret <2 x float> %lA +} + +define <2 x float> @load_v2f32_update(<2 x float>** %ptr) { +;CHECK-LABEL: load_v2f32_update: +;CHECK: vld1.32 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x float>** %ptr + %lA = load <2 x float>* %A, align 1 + %inc = getelementptr <2 x float>* %A, i32 1 + store <2 x float>* %inc, <2 x float>** %ptr + ret <2 x float> %lA +} + +define <1 x i64> @load_v1i64(<1 x i64>** %ptr) { +;CHECK-LABEL: load_v1i64: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <1 x i64>** %ptr + %lA = load <1 x i64>* %A, align 1 + ret <1 x i64> %lA +} + +define <1 x i64> @load_v1i64_update(<1 x i64>** %ptr) { +;CHECK-LABEL: load_v1i64_update: +;CHECK: vld1.64 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <1 x i64>** %ptr + %lA = load <1 x i64>* %A, align 1 + %inc = getelementptr <1 x i64>* %A, i31 1 + store <1 x i64>* %inc, <1 x i64>** %ptr + ret <1 x i64> %lA +} + +define <16 x i8> @load_v16i8(<16 x i8>** %ptr) { +;CHECK-LABEL: load_v16i8: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <16 x i8>** %ptr + %lA = load <16 x i8>* %A, align 1 + ret <16 x i8> %lA +} + +define <16 x i8> @load_v16i8_update(<16 x i8>** %ptr) { +;CHECK-LABEL: load_v16i8_update: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <16 x i8>** %ptr + %lA = load <16 x i8>* %A, align 1 + %inc = getelementptr <16 x i8>* %A, i316 1 + store <16 x i8>* %inc, <16 x i8>** %ptr + ret <16 x i8> %lA +} + +define <8 x i16> @load_v8i16(<8 x i16>** %ptr) { +;CHECK-LABEL: load_v8i16: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <8 x i16>** %ptr + %lA = load <8 x i16>* %A, align 1 + ret <8 x i16> %lA +} + +define <8 x i16> @load_v8i16_update(<8 x i16>** %ptr) { +;CHECK-LABEL: load_v8i16_update: +;CHECK: vld1.16 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <8 x i16>** %ptr + %lA = load <8 x i16>* %A, align 1 + %inc = getelementptr <8 x i16>* %A, i38 1 + store <8 x i16>* %inc, <8 x i16>** %ptr + ret <8 x i16> %lA +} + +define <4 x i32> @load_v4i32(<4 x i32>** %ptr) { +;CHECK-LABEL: load_v4i32: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <4 x i32>** %ptr + %lA = load <4 x i32>* %A, align 1 + ret <4 x i32> %lA +} + +define <4 x i32> @load_v4i32_update(<4 x i32>** %ptr) { +;CHECK-LABEL: load_v4i32_update: +;CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <4 x i32>** %ptr + %lA = load <4 x i32>* %A, align 1 + %inc = getelementptr <4 x i32>* %A, i34 1 + store <4 x i32>* %inc, <4 x i32>** %ptr + ret <4 x i32> %lA +} + +define <4 x float> @load_v4f32(<4 x float>** %ptr) { +;CHECK-LABEL: load_v4f32: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <4 x float>** %ptr + %lA = load <4 x float>* %A, align 1 + ret <4 x float> %lA +} + +define <4 x float> @load_v4f32_update(<4 x float>** %ptr) { +;CHECK-LABEL: load_v4f32_update: +;CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <4 x float>** %ptr + %lA = load <4 x float>* %A, align 1 + %inc = getelementptr <4 x float>* %A, i34 1 + store <4 x float>* %inc, <4 x float>** %ptr + ret <4 x float> %lA +} + +define <2 x i64> @load_v2i64(<2 x i64>** %ptr) { +;CHECK-LABEL: load_v2i64: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <2 x i64>** %ptr + %lA = load <2 x i64>* %A, align 1 + ret <2 x i64> %lA +} + +define <2 x i64> @load_v2i64_update(<2 x i64>** %ptr) { +;CHECK-LABEL: load_v2i64_update: +;CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i64>** %ptr + %lA = load <2 x i64>* %A, align 1 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret <2 x i64> %lA +} Index: test/CodeGen/ARM/vector-store.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/vector-store.ll @@ -0,0 +1,184 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:o-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32" +target triple = "thumbv7s-apple-ios8.0.0" + +define void @store_v8i8(<8 x i8>** %ptr, <8 x i8> %val) { +;CHECK-LABEL: store_v8i8: +;CHECK: str r1, [r0] + %A = load <8 x i8>** %ptr + store <8 x i8> %val, <8 x i8>* %A, align 1 + ret void +} + +define void @store_v8i8_update(<8 x i8>** %ptr, <8 x i8> %val) { +;CHECK-LABEL: store_v8i8_update: +;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <8 x i8>** %ptr + store <8 x i8> %val, <8 x i8>* %A, align 1 + %inc = getelementptr <8 x i8>* %A, i38 1 + store <8 x i8>* %inc, <8 x i8>** %ptr + ret void +} + +define void @store_v4i16(<4 x i16>** %ptr, <4 x i16> %val) { +;CHECK-LABEL: store_v4i16: +;CHECK: str r1, [r0] + %A = load <4 x i16>** %ptr + store <4 x i16> %val, <4 x i16>* %A, align 1 + ret void +} + +define void @store_v4i16_update(<4 x i16>** %ptr, <4 x i16> %val) { +;CHECK-LABEL: store_v4i16_update: +;CHECK: vst1.16 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <4 x i16>** %ptr + store <4 x i16> %val, <4 x i16>* %A, align 1 + %inc = getelementptr <4 x i16>* %A, i34 1 + store <4 x i16>* %inc, <4 x i16>** %ptr + ret void +} + +define void @store_v2i32(<2 x i32>** %ptr, <2 x i32> %val) { +;CHECK-LABEL: store_v2i32: +;CHECK: str r1, [r0] + %A = load <2 x i32>** %ptr + store <2 x i32> %val, <2 x i32>* %A, align 1 + ret void +} + +define void @store_v2i32_update(<2 x i32>** %ptr, <2 x i32> %val) { +;CHECK-LABEL: store_v2i32_update: +;CHECK: vst1.32 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i32>** %ptr + store <2 x i32> %val, <2 x i32>* %A, align 1 + %inc = getelementptr <2 x i32>* %A, i32 1 + store <2 x i32>* %inc, <2 x i32>** %ptr + ret void +} + +define void @store_v2f32(<2 x float>** %ptr, <2 x float> %val) { +;CHECK-LABEL: store_v2f32: +;CHECK: str r1, [r0] + %A = load <2 x float>** %ptr + store <2 x float> %val, <2 x float>* %A, align 1 + ret void +} + +define void @store_v2f32_update(<2 x float>** %ptr, <2 x float> %val) { +;CHECK-LABEL: store_v2f32_update: +;CHECK: vst1.32 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x float>** %ptr + store <2 x float> %val, <2 x float>* %A, align 1 + %inc = getelementptr <2 x float>* %A, i32 1 + store <2 x float>* %inc, <2 x float>** %ptr + ret void +} + +define void @store_v1i64(<1 x i64>** %ptr, <1 x i64> %val) { +;CHECK-LABEL: store_v1i64: +;CHECK: str r1, [r0] + %A = load <1 x i64>** %ptr + store <1 x i64> %val, <1 x i64>* %A, align 1 + ret void +} + +define void @store_v1i64_update(<1 x i64>** %ptr, <1 x i64> %val) { +;CHECK-LABEL: store_v1i64_update: +;CHECK: vst1.64 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <1 x i64>** %ptr + store <1 x i64> %val, <1 x i64>* %A, align 1 + %inc = getelementptr <1 x i64>* %A, i31 1 + store <1 x i64>* %inc, <1 x i64>** %ptr + ret void +} + +define void @store_v16i8(<16 x i8>** %ptr, <16 x i8> %val) { +;CHECK-LABEL: store_v16i8: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <16 x i8>** %ptr + store <16 x i8> %val, <16 x i8>* %A, align 1 + ret void +} + +define void @store_v16i8_update(<16 x i8>** %ptr, <16 x i8> %val) { +;CHECK-LABEL: store_v16i8_update: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <16 x i8>** %ptr + store <16 x i8> %val, <16 x i8>* %A, align 1 + %inc = getelementptr <16 x i8>* %A, i316 1 + store <16 x i8>* %inc, <16 x i8>** %ptr + ret void +} + +define void @store_v8i16(<8 x i16>** %ptr, <8 x i16> %val) { +;CHECK-LABEL: store_v8i16: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <8 x i16>** %ptr + store <8 x i16> %val, <8 x i16>* %A, align 1 + ret void +} + +define void @store_v8i16_update(<8 x i16>** %ptr, <8 x i16> %val) { +;CHECK-LABEL: store_v8i16_update: +;CHECK: vst1.16 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <8 x i16>** %ptr + store <8 x i16> %val, <8 x i16>* %A, align 1 + %inc = getelementptr <8 x i16>* %A, i38 1 + store <8 x i16>* %inc, <8 x i16>** %ptr + ret void +} + +define void @store_v4i32(<4 x i32>** %ptr, <4 x i32> %val) { +;CHECK-LABEL: store_v4i32: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <4 x i32>** %ptr + store <4 x i32> %val, <4 x i32>* %A, align 1 + ret void +} + +define void @store_v4i32_update(<4 x i32>** %ptr, <4 x i32> %val) { +;CHECK-LABEL: store_v4i32_update: +;CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <4 x i32>** %ptr + store <4 x i32> %val, <4 x i32>* %A, align 1 + %inc = getelementptr <4 x i32>* %A, i34 1 + store <4 x i32>* %inc, <4 x i32>** %ptr + ret void +} + +define void @store_v4f32(<4 x float>** %ptr, <4 x float> %val) { +;CHECK-LABEL: store_v4f32: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <4 x float>** %ptr + store <4 x float> %val, <4 x float>* %A, align 1 + ret void +} + +define void @store_v4f32_update(<4 x float>** %ptr, <4 x float> %val) { +;CHECK-LABEL: store_v4f32_update: +;CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <4 x float>** %ptr + store <4 x float> %val, <4 x float>* %A, align 1 + %inc = getelementptr <4 x float>* %A, i34 1 + store <4 x float>* %inc, <4 x float>** %ptr + ret void +} + +define void @store_v2i64(<2 x i64>** %ptr, <2 x i64> %val) { +;CHECK-LABEL: store_v2i64: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <2 x i64>** %ptr + store <2 x i64> %val, <2 x i64>* %A, align 1 + ret void +} + +define void @store_v2i64_update(<2 x i64>** %ptr, <2 x i64> %val) { +;CHECK-LABEL: store_v2i64_update: +;CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i64>** %ptr + store <2 x i64> %val, <2 x i64>* %A, align 1 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret void +}