Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13181,7 +13181,7 @@
 }
 
 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
-                                                       unsigned,
+                                                       unsigned Alignment,
                                                        MachineMemOperand::Flags,
                                                        bool *Fast) const {
   // Depends what it gets converted into if the type is weird.
@@ -13190,23 +13190,18 @@
   // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs
   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
+  auto Ty = VT.getSimpleVT().SimpleTy;
 
-  switch (VT.getSimpleVT().SimpleTy) {
-  default:
-    return false;
-  case MVT::i8:
-  case MVT::i16:
-  case MVT::i32: {
+  if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
     // Unaligned access can use (for example) LDRB, LDRH, LDR
     if (AllowsUnaligned) {
       if (Fast)
         *Fast = Subtarget->hasV7Ops();
       return true;
     }
-    return false;
   }
-  case MVT::f64:
-  case MVT::v2f64: {
+
+  if (Ty == MVT::f64 || Ty == MVT::v2f64) {
     // For any little-endian targets with NEON, we can support unaligned ld/st
     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
     // A big-endian target may also explicitly support unaligned accesses
@@ -13215,9 +13210,52 @@
       *Fast = true;
       return true;
     }
-    return false;
   }
+
+  if (!Subtarget->hasMVEIntegerOps())
+    return false;
+  if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
+      Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
+      Ty != MVT::v2f64)
+    return false;
+
+  if (Subtarget->isLittle()) {
+    // In little-endian MVE, the store instructions VSTRB.U8,
+    // VSTRH.U16 and VSTRW.U32 all store the vector register in
+    // exactly the same format, and differ only in the range of
+    // their immediate offset field and the required alignment.
+    //
+    // In particular, VSTRB.U8 can store a vector at byte alignment.
+    // So at this stage we can simply say that loads/stores of all
+    // 128-bit wide vector types are permitted at any alignment,
+    // because we know at least _one_ instruction can manage that.
+    //
+    // Later on we might find that some of those loads are better
+    // generated as VLDRW.U32 if alignment permits, to take
+    // advantage of the larger immediate range. But for the moment,
+    // all that matters is that, however we lower the load, _some_
+    // instruction can handle it.
+    if (Fast)
+      *Fast = true;
+    return true;
+  } else {
+    // In big-endian MVE, those instructions aren't so similar
+    // after all, because they reorder the bytes of the vector
+    // differently. So this time we can only store a particular
+    // kind of vector if its alignment is at least the size of its
+    // element type. And we can't store vectors of i64 or f64 at all
+    // without having to do some postprocessing, because there's
+    // no VSTRD.U64.
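+    //
+    // For example, a load or store of <8 x i16> with alignment >= 2
+    // passes the check below and can be selected as VLDRH.U16 /
+    // VSTRH.U16, while the same access with alignment 1 fails it,
+    // returns false, and is split up by generic legalization (as the
+    // scalar ldrh/strh sequences in the accompanying tests show).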
+    if (Ty == MVT::v16i8 ||
+        ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
+        ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
+      if (Fast)
+        *Fast = true;
+      return true;
+    }
   }
+
+  return false;
 }
 
 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4239,12 +4239,14 @@
   def : MVE_unpred_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
   def : MVE_unpred_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
   def : MVE_unpred_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
+  def : MVE_unpred_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
 }
 
 class MVE_unpred_vector_load_typed<ValueType Ty, Instruction RegImmInst,
                                    PatFrag LoadKind, int shift>
   : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
         (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
+
 multiclass MVE_unpred_vector_load<Instruction RegImmInst, PatFrag LoadKind,
                                   int shift> {
   def : MVE_unpred_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
   def : MVE_unpred_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>;
@@ -4253,6 +4255,7 @@
   def : MVE_unpred_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
   def : MVE_unpred_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
   def : MVE_unpred_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
+  def : MVE_unpred_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
 }
 
 let Predicates = [HasMVEInt, IsLE] in {
Index: llvm/test/CodeGen/Thumb2/mve-ldst-offset.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/mve-ldst-offset.ll
@@ -0,0 +1,1469 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+
+define i8* @post_ldrwu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 8
+  ret i8* %x
+}
+
+define i8* @post_ldrwu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    adds r2, r0, #3
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 8
+  ret i8* %x
+}
+
+define i8* @post_ldrwu32_m4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_m4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #-4]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -4
+  %0 = bitcast i8* %z to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 8
+  ret i8* %x
+}
+
+define i8* @post_ldrwu32_508(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    add.w r2, r0, #508
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 508
+  %0 = bitcast i8* %z to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 8
+  ret i8* %x
+}
+
+define i8* @post_ldrwu32_512(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    add.w r2, r0, #512
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 512
+  %0 = bitcast i8* %z to <4 x i32>*
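+  ; (512 is just past +-508, the largest offset expressible as a 7-bit
+  ; immediate scaled by 4; the 127/128, 254/256 and 508/512 pairs in this
+  ; file bracket the scaled-imm7 offset ranges of the MVE VLDR/VSTR
+  ; family, which these accesses should eventually be able to use.)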
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrwu32_m508(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r2, r0, #508 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -508 + %0 = bitcast i8* %z to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrwu32_m512(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r2, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -512 + %0 = bitcast i8* %z to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %x +} + + +define i8* @post_ldrhu32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh r2, [r0, #8] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh r2, [r0, #10] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrhu32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh.w r2, [r0, #3] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #5] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #7] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #9] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrhu32_2(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh r2, [r0, #8] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrhu32_254(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh.w r2, [r0, #254] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #256] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #258] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh.w r2, [r0, 
#260] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrhu32_256(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh.w r2, [r0, #256] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #258] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #260] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #262] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + + +define i8* @post_ldrhs32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrsh.w r2, [r0, #4] +; CHECK-NEXT: ldrsh.w r3, [r0, #6] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #8] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrsh.w lr, [r0, #10] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrhs32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrsh.w r2, [r0, #3] +; CHECK-NEXT: ldrsh.w r3, [r0, #5] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #7] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrsh.w lr, [r0, #9] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrhs32_2(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrsh.w r2, [r0, #2] +; CHECK-NEXT: ldrsh.w r3, [r0, #4] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #6] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrsh.w lr, [r0, #8] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrhs32_254(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_254: +; CHECK: @ %bb.0: @ %entry +; 
CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrsh.w r2, [r0, #254] +; CHECK-NEXT: ldrsh.w r3, [r0, #256] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #258] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrsh.w lr, [r0, #260] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrhs32_256(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrsh.w r2, [r0, #256] +; CHECK-NEXT: ldrsh.w r3, [r0, #258] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #260] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrsh.w lr, [r0, #262] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + + +define i8* @post_ldrhu16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #4] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrhu16_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r2, r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrhu16_2(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r2, r0, #2 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrhu16_254(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r2, r0, #254 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrhu16_256(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r2, r0, #256 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = 
getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %x +} + + +define i8* @post_ldrbu32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0, #4] +; CHECK-NEXT: ldrb r3, [r0, #5] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #6] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #7] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbu32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0, #3] +; CHECK-NEXT: ldrb r3, [r0, #4] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #5] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbu32_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb.w r2, [r0, #127] +; CHECK-NEXT: ldrb.w r3, [r0, #128] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #129] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #130] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbu32_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb.w r2, [r0, #128] +; CHECK-NEXT: ldrb.w r3, [r0, #129] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #130] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #131] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + + +define i8* @post_ldrbs32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #5] +; 
CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbs32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #5] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #6] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbs32_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0, #127] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #128] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #129] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #130] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbs32_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0, #128] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #129] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #130] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #131] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %x +} + + +define i8* @post_ldrbu16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0, #4] +; CHECK-NEXT: ldrb r3, [r0, #5] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #6] +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #7] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: ldrb r2, [r0, #8] +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrb r2, [r0, #9] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb r2, [r0, #10] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrb r2, [r0, #11] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext 
<8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbu16_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0, #3] +; CHECK-NEXT: ldrb r3, [r0, #4] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #5] +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #6] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: ldrb r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrb r2, [r0, #8] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb r2, [r0, #9] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrb r2, [r0, #10] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbu16_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb.w r2, [r0, #127] +; CHECK-NEXT: ldrb.w r3, [r0, #128] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #129] +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #130] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: ldrb.w r2, [r0, #131] +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrb.w r2, [r0, #132] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb.w r2, [r0, #133] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrb.w r2, [r0, #134] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbu16_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb.w r2, [r0, #128] +; CHECK-NEXT: ldrb.w r3, [r0, #129] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #130] +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #131] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: ldrb.w r2, [r0, #132] +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrb.w r2, [r0, #133] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb.w r2, [r0, #134] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrb.w r2, [r0, #135] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %x +} + + +define i8* @post_ldrbs16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w 
r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #8] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #9] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #10] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #11] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbs16_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #8] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #9] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #10] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbs16_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0, #127] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #128] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #129] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #130] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #131] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #132] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #133] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #134] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %x +} + +define i8* @post_ldrbs16_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0, #128] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #129] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #130] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #131] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #132] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #133] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #134] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #135] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, 
<8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %x +} + + +define i8* @post_ldrbu8_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #4] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrbu8_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r2, r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrbu8_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r2, r0, #127 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrbu8_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #128] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrwf32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #4] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x float>* + %1 = load <4 x float>, <4 x float>* %0, align 8 + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 8 + ret i8* %x +} + +define i8* @post_ldrwf16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #4] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 8 + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 8 + ret i8* %x +} + + + + + +define i8* @post_strw32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %y +} + +define i8* @post_strw32_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds 
i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %y +} + +define i8* @post_strw32_m4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_m4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #-4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -4 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %y +} + +define i8* @post_strw32_508(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #508 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %y +} + +define i8* @post_strw32_512(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #512 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %y +} + +define i8* @post_strw32_m508(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: sub.w r1, r0, #508 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %y +} + +define i8* @post_strw32_m512(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: sub.w r1, r0, #512 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %y +} + + +define i8* @post_strh32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %z to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %y +} + +define i8* @post_strh32_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str.w r1, [r0, #3] +; CHECK-NEXT: str.w r2, [r0, #7] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %z to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %y +} + +define i8* @post_strh32_2(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; 
CHECK-NEXT: str.w r1, [r0, #2] +; CHECK-NEXT: str.w r2, [r0, #6] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %z to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %y +} + +define i8* @post_strh32_254(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str.w r1, [r0, #254] +; CHECK-NEXT: str.w r2, [r0, #258] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %z to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %y +} + +define i8* @post_strh32_256(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0, #256] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %z to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %y +} + + +define i8* @post_strh16_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %z to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %y +} + +define i8* @post_strh16_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %z to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %y +} + +define i8* @post_strh16_2(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %z to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %y +} + +define i8* @post_strh16_254(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #254 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %z to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %y +} + +define i8* @post_strh16_256(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #256 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %z to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %y +} + + +define i8* 
@post_strb32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r1, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %z to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %y +} + +define i8* @post_strb32_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str.w r1, [r0, #3] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %z to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %y +} + +define i8* @post_strb32_127(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str.w r1, [r0, #127] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %z to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %y +} + +define i8* @post_strb32_128(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str.w r1, [r0, #128] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %z to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %y +} + + +define i8* @post_strb16_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %z to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %y +} + +define i8* @post_strb16_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str.w r1, [r0, #3] +; CHECK-NEXT: str.w r2, [r0, #7] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %z to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %y +} + +define i8* @post_strb16_127(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str.w r1, [r0, #127] +; CHECK-NEXT: str.w r2, [r0, #131] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %z to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %y +} + +define i8* @post_strb16_128(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0, #128] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %z to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %y +} + + +define i8* @post_strb8_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_4: +; CHECK: @ %bb.0: 
@ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %z to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %y +} + +define i8* @post_strb8_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %z to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %y +} + +define i8* @post_strb8_127(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #127 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %z to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %y +} + +define i8* @post_strb8_128(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #128] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %z to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %y +} + +define i8* @post_strf32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %1 = load <4 x float>, <4 x float>* %0, align 8 + %2 = bitcast i8* %z to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 8 + ret i8* %y +} + +define i8* @post_strf16_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 8 + %2 = bitcast i8* %z to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 8 + ret i8* %y +} Index: llvm/test/CodeGen/Thumb2/mve-ldst-postinc.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/mve-ldst-postinc.ll @@ -0,0 +1,1509 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s + +define i8* @post_ldrwu32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; 
CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_m4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_m4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -4 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_508(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 508 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_512(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 512 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_m508(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -508 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_m512(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -512 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + + +define i8* @post_ldrhu32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: ldrh r3, [r2, #4]! 
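+; (Note how the pointer increment from the getelementptr has been folded
+; into the "ldrh r3, [r2, #4]!" above: a pre-indexed scalar load with
+; writeback, saving a separate add of #4.)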
+; CHECK-NEXT: ldrh r0, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhu32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhu32_2(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: ldrh r3, [r2, #2]! +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrh r3, [r0, #4] +; CHECK-NEXT: ldrh r0, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhu32_254(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhu32_256(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrhs32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsh.w r2, [r0] +; CHECK-NEXT: ldrsh.w 
r3, [r0, #2] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #6] +; CHECK-NEXT: ldrsh r2, [r0, #4]! +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhs32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrsh.w r2, [r0] +; CHECK-NEXT: ldrsh.w r3, [r0, #2] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #4] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrsh.w lr, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhs32_2(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsh.w r2, [r0] +; CHECK-NEXT: ldrsh.w r3, [r0, #4] +; CHECK-NEXT: ldrsh.w r12, [r0, #6] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh r2, [r0, #2]! +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhs32_254(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrsh.w r2, [r0] +; CHECK-NEXT: ldrsh.w r3, [r0, #2] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #4] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrsh.w lr, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhs32_256(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrsh.w r2, [r0] +; CHECK-NEXT: ldrsh.w r3, [r0, #2] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #4] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrsh.w lr, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = 
sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrhu16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrhu16_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrhu16_2(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrhu16_254(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrhu16_256(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + + +define i8* @post_ldrbu32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: ldrb r3, [r0, #1] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #3] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: ldrb r3, [r0, #1] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrb r2, [r0, #3]! 
+; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu32_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: ldrb r3, [r0, #1] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #3] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu32_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: ldrb r3, [r0, #1] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #3] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrbs32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb r2, [r0, #3]! 
+; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs32_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs32_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrbu16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: ldrb r3, [r0, #1] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #2] +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: ldrb.w lr, [r0, #3] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: ldrb r3, [r2, #4]! +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: ldrb r3, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: ldrb r3, [r0, #6] +; CHECK-NEXT: ldrb r0, [r0, #7] +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu16_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: ldrb r3, [r0, #1] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #2] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: ldrb r3, [r2, #3]! 
+; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: ldrb.w lr, [r0, #4] +; CHECK-NEXT: vmov.16 q0[3], r3 +; CHECK-NEXT: ldrb r3, [r0, #5] +; CHECK-NEXT: vmov.16 q0[4], lr +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: ldrb r3, [r0, #6] +; CHECK-NEXT: ldrb r0, [r0, #7] +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu16_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: ldrb r3, [r0, #1] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #2] +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #3] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: ldrb r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrb r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrb r2, [r0, #7] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu16_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: ldrb r3, [r0, #1] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #2] +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #3] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: ldrb r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrb r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrb r2, [r0, #7] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrbs16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: ldrsb r3, [r2, #4]! 
+; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: ldrsb.w r3, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: ldrsb.w r3, [r0, #6] +; CHECK-NEXT: ldrsb.w r0, [r0, #7] +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs16_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: ldrsb r3, [r2, #3]! +; CHECK-NEXT: vmov.16 q0[3], r3 +; CHECK-NEXT: ldrsb.w r3, [r0, #4] +; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: ldrsb.w r3, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: ldrsb.w r3, [r0, #6] +; CHECK-NEXT: ldrsb.w r0, [r0, #7] +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs16_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs16_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb.w r2, [r0] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 
= bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrbu8_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrbu8_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrbu8_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrbu8_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwf32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %1 = load <4 x float>, <4 x float>* %0, align 8 + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwf16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 8 + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 8 + ret i8* %z +} + + + + + +define i8* @post_strw32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr 
+entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_m4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_m4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -4 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_508(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_512(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_m508(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_m512(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + + +define i8* @post_strh32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: str r2, [r0, #4]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %y to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh32_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %y to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh32_2(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %y to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh32_254(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %y to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh32_256(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %y to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %z +} + + +define i8* @post_strh16_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh16_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh16_2(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh16_254(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_254: +; 
CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh16_256(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + + +define i8* @post_strb32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r1, [r0], #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %y to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb32_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r1, [r0], #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %y to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb32_127(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r1, [r0], #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %y to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb32_128(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r1, [r0], #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %y to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %z +} + + +define i8* @post_strb16_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: str r2, [r0, #4]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %y to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb16_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %y to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb16_127(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %y to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb16_128(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %y to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %z +} + + +define i8* @post_strb8_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb8_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb8_127(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb8_128(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strf32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strf32_4: +; CHECK: @ %bb.0: @ %entry 
+; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %1 = load <4 x float>, <4 x float>* %0, align 8 + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 8 + ret i8* %z +} + +define i8* @post_strf16_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 8 + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 8 + ret i8* %z +} Index: llvm/test/CodeGen/Thumb2/mve-ldst-preinc.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/mve-ldst-preinc.ll @@ -0,0 +1,1473 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s + +define i8* @post_ldrwu32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_m4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_m4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #-4] +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -4 + %0 = bitcast i8* %z to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_508(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 508 + %0 = bitcast i8* %z to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_512(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 512 + %0 = bitcast i8* %z to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_m508(i8* 
%x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -508 + %0 = bitcast i8* %z to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwu32_m512(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwu32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -512 + %0 = bitcast i8* %z to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + + +define i8* @post_ldrhu32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0, #4]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhu32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0, #3]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhu32_2(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0, #2]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhu32_254(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh r2, [r0, #254]! 
+; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhu32_256(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrh.w r2, [r0, #256] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #258] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #260] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrh.w r2, [r0, #262] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrhs32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsh r2, [r0, #4]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r2, [r0, #2] +; CHECK-NEXT: ldrsh.w r3, [r0, #4] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhs32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsh r2, [r0, #3]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r2, [r0, #2] +; CHECK-NEXT: ldrsh.w r3, [r0, #4] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhs32_2(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsh r2, [r0, #2]! 
+; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r2, [r0, #2] +; CHECK-NEXT: ldrsh.w r3, [r0, #4] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhs32_254(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsh r2, [r0, #254]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r2, [r0, #2] +; CHECK-NEXT: ldrsh.w r3, [r0, #4] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #6] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrhs32_256(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhs32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrsh.w r2, [r0, #256] +; CHECK-NEXT: ldrsh.w r3, [r0, #258] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsh.w r12, [r0, #260] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldrsh.w lr, [r0, #262] +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrhu16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrhu16_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrhu16_2(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrhu16_254(i8* %x, i8* %y) { +; 
CHECK-LABEL: post_ldrhu16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrhu16_256(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrhu16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + + +define i8* @post_ldrbu32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrb r2, [r0, #4]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #3] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrb r2, [r0, #3]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #3] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu32_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrb r2, [r0, #127]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #3] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu32_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrb r2, [r0, #128]! 
+; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #3] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrbs32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb r2, [r0, #4]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs32_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb r2, [r0, #3]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs32_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb r2, [r0, #127]! +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs32_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb r2, [r0, #128]! 
+; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrbu16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0, #4]! +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #3] +; CHECK-NEXT: vmov.16 q0[2], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #4] +; CHECK-NEXT: vmov.16 q0[3], r12 +; CHECK-NEXT: ldrb r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[4], lr +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrb r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu16_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0, #3]! +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #3] +; CHECK-NEXT: vmov.16 q0[2], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #4] +; CHECK-NEXT: vmov.16 q0[3], r12 +; CHECK-NEXT: ldrb r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[4], lr +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrb r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu16_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0, #127]! 
+; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #3] +; CHECK-NEXT: vmov.16 q0[2], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #4] +; CHECK-NEXT: vmov.16 q0[3], r12 +; CHECK-NEXT: ldrb r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[4], lr +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrb r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbu16_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrb r2, [r0, #128]! +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #3] +; CHECK-NEXT: vmov.16 q0[2], r3 +; CHECK-NEXT: ldrb.w lr, [r0, #4] +; CHECK-NEXT: vmov.16 q0[3], r12 +; CHECK-NEXT: ldrb r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[4], lr +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrb r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrbs16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb r2, [r0, #4]! +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs16_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb r2, [r0, #3]! 
+; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs16_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb r2, [r0, #127]! +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + +define i8* @post_ldrbs16_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbs16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrsb r2, [r0, #128]! 
+; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #1] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #2] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #5] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 8 + ret i8* %z +} + + +define i8* @post_ldrbu8_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrbu8_3(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrbu8_127(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrbu8_128(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrbu8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #128] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwf32_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x float>* + %1 = load <4 x float>, <4 x float>* %0, align 8 + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 8 + ret i8* %z +} + +define i8* @post_ldrwf16_4(i8* %x, i8* %y) { +; CHECK-LABEL: post_ldrwf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 8 + %2 = bitcast i8* 
%y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 8 + ret i8* %z +} + + + + + +define i8* @post_strw32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_m4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_m4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #-4] +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -4 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_508(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_512(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_m508(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + +define i8* @post_strw32_m512(i8* %y, i8* %x) { +; CHECK-LABEL: post_strw32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 8 + %2 = bitcast i8* %z to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 8 + ret i8* %z +} + + +define i8* @post_strh32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str r1, [r0, #4]! 
+; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %z to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh32_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str r1, [r0, #3]! +; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %z to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh32_2(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str r1, [r0, #2]! +; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %z to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh32_254(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str r1, [r0, #254]! +; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %z to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh32_256(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: strd r1, r2, [r0, #256] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 8 + %2 = bitcast i8* %z to <4 x i16>* + store <4 x i16> %1, <4 x i16>* %2, align 8 + ret i8* %z +} + + +define i8* @post_strh16_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %z to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh16_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %z to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh16_2(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %z to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* 
@post_strh16_254(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %z to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + +define i8* @post_strh16_256(i8* %y, i8* %x) { +; CHECK-LABEL: post_strh16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 8 + %2 = bitcast i8* %z to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 8 + ret i8* %z +} + + +define i8* @post_strb32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r1, [r0, #4]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %z to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb32_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r1, [r0, #3]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %z to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb32_127(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r1, [r0, #127]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %z to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb32_128(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r1, [r0, #128]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 8 + %2 = bitcast i8* %z to <4 x i8>* + store <4 x i8> %1, <4 x i8>* %2, align 8 + ret i8* %z +} + + +define i8* @post_strb16_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str r1, [r0, #4]! +; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %z to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb16_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str r1, [r0, #3]! 
+; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %z to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb16_127(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str r1, [r0, #127]! +; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %z to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb16_128(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r2, [r1] +; CHECK-NEXT: str r1, [r0, #128]! +; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 8 + %2 = bitcast i8* %z to <8 x i8>* + store <8 x i8> %1, <8 x i8>* %2, align 8 + ret i8* %z +} + + +define i8* @post_strb8_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %z to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb8_3(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %z to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb8_127(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %z to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strb8_128(i8* %y, i8* %x) { +; CHECK-LABEL: post_strb8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #128] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 8 + %2 = bitcast i8* %z to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 8 + ret i8* %z +} + +define i8* @post_strf32_4(i8* %y, i8* %x) { +; CHECK-LABEL: post_strf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %1 = load <4 x float>, <4 x float>* %0, align 8 + %2 = bitcast i8* %z to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 8 + ret i8* %z +} + +define i8* @post_strf16_4(i8* 
%y, i8* %x) { +; CHECK-LABEL: post_strf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 8 + %2 = bitcast i8* %z to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 8 + ret i8* %z +} Index: llvm/test/CodeGen/Thumb2/mve-ldst-regimm.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/mve-ldst-regimm.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s + +%struct.s_int8_t = type { [16 x i8], [16 x i8] } +%struct.s_int16_t = type { [8 x i16], [8 x i16] } +%struct.s_int32_t = type { [4 x i32], [4 x i32] } +%struct.s_float16_t = type { [8 x half], [8 x half] } +%struct.s_float32_t = type { [4 x float], [4 x float] } + +define hidden void @fwd_int8_t(%struct.s_int8_t* noalias %v) local_unnamed_addr #0 { +; CHECK-LABEL: fwd_int8_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: vstrb.8 q0, [r0, #16] +; CHECK-NEXT: bx lr +entry: + %arrayidx3 = getelementptr inbounds %struct.s_int8_t, %struct.s_int8_t* %v, i32 0, i32 1, i32 0 + %0 = bitcast %struct.s_int8_t* %v to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %arrayidx3 to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret void +} + +define hidden void @fwd_int16_t(%struct.s_int16_t* noalias nocapture %v) local_unnamed_addr #0 { +; CHECK-LABEL: fwd_int16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r0, #16] +; CHECK-NEXT: bx lr +entry: + %arrayidx3 = getelementptr inbounds %struct.s_int16_t, %struct.s_int16_t* %v, i32 0, i32 1, i32 0 + %0 = bitcast %struct.s_int16_t* %v to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i16* %arrayidx3 to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret void +} + +define hidden void @fwd_int32_t(%struct.s_int32_t* noalias nocapture %v) local_unnamed_addr #0 { +; CHECK-LABEL: fwd_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, #16] +; CHECK-NEXT: bx lr +entry: + %arrayidx3 = getelementptr inbounds %struct.s_int32_t, %struct.s_int32_t* %v, i32 0, i32 1, i32 0 + %0 = bitcast %struct.s_int32_t* %v to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i32* %arrayidx3 to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret void +} + +define hidden void @fwd_float16_t(%struct.s_float16_t* noalias nocapture %v) local_unnamed_addr #0 { +; CHECK-LABEL: fwd_float16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r0, #16] +; CHECK-NEXT: bx lr +entry: + %arrayidx3 = getelementptr inbounds %struct.s_float16_t, %struct.s_float16_t* %v, i32 0, i32 1, i32 0 + %0 = bitcast %struct.s_float16_t* %v to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast half* %arrayidx3 to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret void +} + +define hidden void @fwd_float32_t(%struct.s_float32_t* noalias nocapture %v) local_unnamed_addr #0 { +; CHECK-LABEL: fwd_float32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, #16] +; CHECK-NEXT: bx lr 
+entry: + %d = getelementptr inbounds %struct.s_float32_t, %struct.s_float32_t* %v, i32 0, i32 1 + %0 = bitcast %struct.s_float32_t* %v to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast [4 x float]* %d to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret void +} + +define hidden void @bwd_int8_t(%struct.s_int8_t* noalias %v) local_unnamed_addr #0 { +; CHECK-LABEL: bwd_int8_t: +; CHECK: @ %bb.0: @ %for.end +; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: vstrb.8 q0, [r0, #-16] +; CHECK-NEXT: bx lr +for.end: + %0 = bitcast %struct.s_int8_t* %v to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %arrayidx3 = getelementptr inbounds %struct.s_int8_t, %struct.s_int8_t* %v, i32 -1, i32 1, i32 0 + %2 = bitcast i8* %arrayidx3 to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret void +} + +define hidden void @bwd_int16_t(%struct.s_int16_t* noalias nocapture %v) local_unnamed_addr #0 { +; CHECK-LABEL: bwd_int16_t: +; CHECK: @ %bb.0: @ %for.end +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r0, #-16] +; CHECK-NEXT: bx lr +for.end: + %0 = bitcast %struct.s_int16_t* %v to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %arrayidx3 = getelementptr inbounds %struct.s_int16_t, %struct.s_int16_t* %v, i32 -1, i32 1, i32 0 + %2 = bitcast i16* %arrayidx3 to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret void +} + +define hidden void @bwd_int32_t(%struct.s_int32_t* noalias nocapture %v) local_unnamed_addr #0 { +; CHECK-LABEL: bwd_int32_t: +; CHECK: @ %bb.0: @ %for.end +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, #-16] +; CHECK-NEXT: bx lr +for.end: + %0 = bitcast %struct.s_int32_t* %v to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %arrayidx3 = getelementptr inbounds %struct.s_int32_t, %struct.s_int32_t* %v, i32 -1, i32 1, i32 0 + %2 = bitcast i32* %arrayidx3 to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret void +} + +define hidden void @bwd_float16_t(%struct.s_float16_t* noalias nocapture %v) local_unnamed_addr #0 { +; CHECK-LABEL: bwd_float16_t: +; CHECK: @ %bb.0: @ %for.end +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r0, #-16] +; CHECK-NEXT: bx lr +for.end: + %0 = bitcast %struct.s_float16_t* %v to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 2 + %arrayidx3 = getelementptr inbounds %struct.s_float16_t, %struct.s_float16_t* %v, i32 -1, i32 1, i32 0 + %2 = bitcast half* %arrayidx3 to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret void +} + +define hidden void @bwd_float32_t(%struct.s_float32_t* noalias nocapture %v) local_unnamed_addr #0 { +; CHECK-LABEL: bwd_float32_t: +; CHECK: @ %bb.0: @ %for.end +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, #-16] +; CHECK-NEXT: bx lr +for.end: + %0 = bitcast %struct.s_float32_t* %v to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %d = getelementptr inbounds %struct.s_float32_t, %struct.s_float32_t* %v, i32 -1, i32 1 + %2 = bitcast [4 x float]* %d to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret void +} Index: llvm/test/CodeGen/Thumb2/mve-loadstore.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/mve-loadstore.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4(<4 x i32>* %vp) { +; 
CHECK-LABEL: load_4xi32_a4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %vp, align 4 + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a2(<4 x i32>* %vp) { +; CHECK-LABEL: load_4xi32_a2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %vp, align 2 + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a1(<4 x i32>* %vp) { +; CHECK-LABEL: load_4xi32_a1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %vp, align 1 + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc void @store_4xi32_a4(<4 x i32>* %vp, <4 x i32> %val) { +; CHECK-LABEL: store_4xi32_a4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + store <4 x i32> %val, <4 x i32>* %vp, align 4 + ret void +} + +define arm_aapcs_vfpcc void @store_4xi32_a2(<4 x i32>* %vp, <4 x i32> %val) { +; CHECK-LABEL: store_4xi32_a2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + store <4 x i32> %val, <4 x i32>* %vp, align 2 + ret void +} + +define arm_aapcs_vfpcc void @store_4xi32_a1(<4 x i32>* %vp, <4 x i32> %val) { +; CHECK-LABEL: store_4xi32_a1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + store <4 x i32> %val, <4 x i32>* %vp, align 1 + ret void +} + +define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_pos(i32* %ip) { +; CHECK-LABEL: load_4xi32_a4_offset_pos: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %ipoffset = getelementptr inbounds i32, i32* %ip, i32 127 + %vp = bitcast i32* %ipoffset to <4 x i32>* + %0 = load <4 x i32>, <4 x i32>* %vp, align 4 + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_neg(i32* %ip) { +; CHECK-LABEL: load_4xi32_a4_offset_neg: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %ipoffset = getelementptr inbounds i32, i32* %ip, i32 -127 + %vp = bitcast i32* %ipoffset to <4 x i32>* + %0 = load <4 x i32>, <4 x i32>* %vp, align 4 + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @loadstore_4xi32_stack_off16() { +; CHECK-LABEL: loadstore_4xi32_stack_off16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: movs r0, #3 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] +; CHECK-NEXT: str r0, [sp, #16] +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] +; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: bx lr +entry: + %c = alloca [1 x [5 x [2 x i32]]], align 4 + %0 = bitcast [1 x [5 x [2 x i32]]]* %c to i8* + %arrayidx5 = getelementptr inbounds [1 x [5 x [2 x i32]]], [1 x [5 x [2 x i32]]]* %c, i32 0, i32 0, i32 0, i32 0 + %1 = bitcast [1 x [5 x [2 x i32]]]* %c to <4 x i32>* + store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* %1, align 4 + %arrayidx5.2 = getelementptr inbounds [1 x [5 x [2 x i32]]], [1 x [5 x [2 x i32]]]* %c, i32 0, i32 0, i32 2, i32 0 + %2 = bitcast i32* %arrayidx5.2 to <4 x i32>* + store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* %2, align 4 + store i32 3, i32* %arrayidx5.2, align 4 + %3 = load <4 x i32>, <4 x i32>* %2, align 4 + ret <4 x i32> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @loadstore_8xi16_stack_off16() { +; CHECK-LABEL: 
loadstore_8xi16_stack_off16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vdup.16 q0, r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: movs r0, #3 +; CHECK-NEXT: vstrh.16 q0, [sp, #16] +; CHECK-NEXT: strh.w r0, [sp, #16] +; CHECK-NEXT: vldrh.u16 q0, [sp, #16] +; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: bx lr +entry: + %c = alloca [1 x [10 x [2 x i16]]], align 2 + %0 = bitcast [1 x [10 x [2 x i16]]]* %c to i8* + %arrayidx5 = getelementptr inbounds [1 x [10 x [2 x i16]]], [1 x [10 x [2 x i16]]]* %c, i32 0, i32 0, i32 0, i32 0 + %1 = bitcast [1 x [10 x [2 x i16]]]* %c to <8 x i16>* + store <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>* %1, align 2 + %arrayidx5.2 = getelementptr inbounds [1 x [10 x [2 x i16]]], [1 x [10 x [2 x i16]]]* %c, i32 0, i32 0, i32 4, i32 0 + %2 = bitcast i16* %arrayidx5.2 to <8 x i16>* + store <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>* %2, align 2 + store i16 3, i16* %arrayidx5.2, align 2 + %3 = load <8 x i16>, <8 x i16>* %2, align 2 + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <16 x i8> @loadstore_16xi8_stack_off16() { +; CHECK-LABEL: loadstore_16xi8_stack_off16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vdup.8 q0, r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrb.8 q0, [r0] +; CHECK-NEXT: movs r0, #3 +; CHECK-NEXT: vstrb.8 q0, [sp, #16] +; CHECK-NEXT: strb.w r0, [sp, #16] +; CHECK-NEXT: vldrb.u8 q0, [sp, #16] +; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: bx lr +entry: + %c = alloca [1 x [20 x [2 x i8]]], align 1 + %0 = bitcast [1 x [20 x [2 x i8]]]* %c to i8* + %arrayidx5 = getelementptr inbounds [1 x [20 x [2 x i8]]], [1 x [20 x [2 x i8]]]* %c, i32 0, i32 0, i32 0, i32 0 + %1 = bitcast [1 x [20 x [2 x i8]]]* %c to <16 x i8>* + store <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>* %1, align 1 + %arrayidx5.2 = getelementptr inbounds [1 x [20 x [2 x i8]]], [1 x [20 x [2 x i8]]]* %c, i32 0, i32 0, i32 8, i32 0 + %2 = bitcast i8* %arrayidx5.2 to <16 x i8>* + store <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>* %2, align 1 + store i8 3, i8* %arrayidx5.2, align 1 + %3 = load <16 x i8>, <16 x i8>* %2, align 1 + ret <16 x i8> %3 +}