Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.h =================================================================== --- llvm/trunk/lib/Target/ARM/ARMISelLowering.h +++ llvm/trunk/lib/Target/ARM/ARMISelLowering.h @@ -814,6 +814,7 @@ MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const; void addMVEVectorTypes(bool HasMVEFP); + void addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action); void setAllExpand(MVT VT); }; Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp @@ -226,6 +226,13 @@ setOperationAction(Opc, VT, Expand); } +void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To, + LegalizeAction Action) { + setLoadExtAction(ISD::EXTLOAD, From, To, Action); + setLoadExtAction(ISD::ZEXTLOAD, From, To, Action); + setLoadExtAction(ISD::SEXTLOAD, From, To, Action); +} + void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 }; @@ -277,6 +284,16 @@ setOperationAction(ISD::LOAD, VT, Legal); setOperationAction(ISD::STORE, VT, Legal); } + + // It is legal to extload from v8i8 to v8i16 and from v4i8/v4i16 to v4i32. + addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal); + addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal); + addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal); + + // Some truncating stores are legal too. 
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, @@ -587,9 +604,7 @@ for (MVT VT : MVT::vector_valuetypes()) { for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + addAllExtLoads(VT, InnerVT, Expand); } setOperationAction(ISD::MULHS, VT, Expand); @@ -13197,7 +13212,9 @@ return false; if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 && Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 && - Ty != MVT::v2f64) + Ty != MVT::v2f64 && + // These are for truncated stores + Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16) return false; if (Subtarget->isLittle()) { Index: llvm/trunk/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrMVE.td +++ llvm/trunk/lib/Target/ARM/ARMInstrMVE.td @@ -4201,6 +4201,42 @@ def : MVE_unpred_vector_load_typed; } + +// Widening/Narrowing Loads/Stores + +let Predicates = [HasMVEInt] in { + def : Pat<(truncstorevi8 (v8i16 MQPR:$val), t2addrmode_imm7<1>:$addr), + (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<1>:$addr)>; + def : Pat<(truncstorevi8 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr), + (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<1>:$addr)>; + def : Pat<(truncstorevi16 (v4i32 MQPR:$val), t2addrmode_imm7<2>:$addr), + (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<2>:$addr)>; +} + +multiclass MVEExtLoad { + def _Any : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) + (!cast("extloadvi" # SrcElemBits) am:$addr)), + (!cast("MVE_VLDR" # SrcElemType # "U" # DestElemBits) + am:$addr)>; + def _Z : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) + (!cast("zextloadvi" # SrcElemBits) 
am:$addr)), + (!cast("MVE_VLDR" # SrcElemType # "U" # DestElemBits) + am:$addr)>; + def _S : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) + (!cast("sextloadvi" # SrcElemBits) am:$addr)), + (!cast("MVE_VLDR" # SrcElemType # "S" # DestElemBits) + am:$addr)>; +} + +let Predicates = [HasMVEInt] in { + defm : MVEExtLoad<"4", "32", "8", "B", t2addrmode_imm7<1>>; + defm : MVEExtLoad<"8", "16", "8", "B", t2addrmode_imm7<1>>; + defm : MVEExtLoad<"4", "32", "16", "H", t2addrmode_imm7<2>>; +} + + // Bit convert patterns let Predicates = [HasMVEInt] in { Index: llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll @@ -115,14 +115,7 @@ define i8* @post_ldrhu32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh r2, [r0, #6] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh r2, [r0, #8] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh r2, [r0, #10] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vldrh.u32 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -138,14 +131,8 @@ define i8* @post_ldrhu32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh.w r2, [r0, #3] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #5] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #7] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #9] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adds r2, r0, #3 +; CHECK-NEXT: vldrh.u32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -161,14 +148,8 @@ define i8* @post_ldrhu32_2(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: 
ldrh r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh r2, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh r2, [r0, #8] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adds r2, r0, #2 +; CHECK-NEXT: vldrh.u32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -184,14 +165,8 @@ define i8* @post_ldrhu32_254(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_254: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh.w r2, [r0, #254] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #256] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #258] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #260] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: add.w r2, r0, #254 +; CHECK-NEXT: vldrh.u32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -207,14 +182,8 @@ define i8* @post_ldrhu32_256(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_256: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh.w r2, [r0, #256] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #258] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #260] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #262] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: add.w r2, r0, #256 +; CHECK-NEXT: vldrh.u32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -231,18 +200,9 @@ define i8* @post_ldrhs32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrsh.w r2, [r0, #4] -; CHECK-NEXT: ldrsh.w r3, [r0, #6] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #8] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrsh.w lr, [r0, #10] -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vldrh.s32 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, 
i8* %x, i32 4 %0 = bitcast i8* %z to <4 x i16>* @@ -256,18 +216,10 @@ define i8* @post_ldrhs32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrsh.w r2, [r0, #3] -; CHECK-NEXT: ldrsh.w r3, [r0, #5] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #7] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrsh.w lr, [r0, #9] -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: adds r2, r0, #3 +; CHECK-NEXT: vldrh.s32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 3 %0 = bitcast i8* %z to <4 x i16>* @@ -281,18 +233,10 @@ define i8* @post_ldrhs32_2(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrsh.w r2, [r0, #2] -; CHECK-NEXT: ldrsh.w r3, [r0, #4] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #6] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrsh.w lr, [r0, #8] -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: adds r2, r0, #2 +; CHECK-NEXT: vldrh.s32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 2 %0 = bitcast i8* %z to <4 x i16>* @@ -306,18 +250,10 @@ define i8* @post_ldrhs32_254(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_254: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrsh.w r2, [r0, #254] -; CHECK-NEXT: ldrsh.w r3, [r0, #256] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #258] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrsh.w lr, [r0, #260] -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: add.w r2, r0, #254 +; CHECK-NEXT: vldrh.s32 q0, [r2] ; CHECK-NEXT: 
vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 254 %0 = bitcast i8* %z to <4 x i16>* @@ -331,18 +267,10 @@ define i8* @post_ldrhs32_256(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_256: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrsh.w r2, [r0, #256] -; CHECK-NEXT: ldrsh.w r3, [r0, #258] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #260] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrsh.w lr, [r0, #262] -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: add.w r2, r0, #256 +; CHECK-NEXT: vldrh.s32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 256 %0 = bitcast i8* %z to <4 x i16>* @@ -437,18 +365,9 @@ define i8* @post_ldrbu32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0, #4] -; CHECK-NEXT: ldrb r3, [r0, #5] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #6] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #7] -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 4 %0 = bitcast i8* %z to <4 x i8>* @@ -462,18 +381,10 @@ define i8* @post_ldrbu32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0, #3] -; CHECK-NEXT: ldrb r3, [r0, #4] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #5] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: adds 
r2, r0, #3 +; CHECK-NEXT: vldrb.u32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 3 %0 = bitcast i8* %z to <4 x i8>* @@ -487,18 +398,10 @@ define i8* @post_ldrbu32_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb.w r2, [r0, #127] -; CHECK-NEXT: ldrb.w r3, [r0, #128] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #129] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #130] -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: add.w r2, r0, #127 +; CHECK-NEXT: vldrb.u32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 127 %0 = bitcast i8* %z to <4 x i8>* @@ -512,18 +415,10 @@ define i8* @post_ldrbu32_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb.w r2, [r0, #128] -; CHECK-NEXT: ldrb.w r3, [r0, #129] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #130] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #131] -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: add.w r2, r0, #128 +; CHECK-NEXT: vldrb.u32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 128 %0 = bitcast i8* %z to <4 x i8>* @@ -538,14 +433,7 @@ define i8* @post_ldrbs32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #5] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #7] -; 
CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vldrb.s32 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -561,14 +449,8 @@ define i8* @post_ldrbs32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #5] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #6] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adds r2, r0, #3 +; CHECK-NEXT: vldrb.s32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -584,14 +466,8 @@ define i8* @post_ldrbs32_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0, #127] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #128] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #129] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #130] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: add.w r2, r0, #127 +; CHECK-NEXT: vldrb.s32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -607,14 +483,8 @@ define i8* @post_ldrbs32_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0, #128] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #129] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #130] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #131] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: add.w r2, r0, #128 +; CHECK-NEXT: vldrb.s32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -631,26 +501,9 @@ define i8* @post_ldrbu16_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0, #4] -; CHECK-NEXT: ldrb r3, [r0, #5] -; CHECK-NEXT: 
vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #6] -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #7] -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: ldrb r2, [r0, #8] -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrb r2, [r0, #9] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrb r2, [r0, #10] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrb r2, [r0, #11] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vldrb.u16 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 4 %0 = bitcast i8* %z to <8 x i8>* @@ -664,26 +517,10 @@ define i8* @post_ldrbu16_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0, #3] -; CHECK-NEXT: ldrb r3, [r0, #4] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #5] -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #6] -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: ldrb r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrb r2, [r0, #8] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrb r2, [r0, #9] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrb r2, [r0, #10] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: adds r2, r0, #3 +; CHECK-NEXT: vldrb.u16 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 3 %0 = bitcast i8* %z to <8 x i8>* @@ -697,26 +534,10 @@ define i8* @post_ldrbu16_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb.w r2, [r0, #127] -; CHECK-NEXT: ldrb.w r3, [r0, #128] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #129] -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: ldrb.w 
lr, [r0, #130] -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: ldrb.w r2, [r0, #131] -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrb.w r2, [r0, #132] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrb.w r2, [r0, #133] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrb.w r2, [r0, #134] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: add.w r2, r0, #127 +; CHECK-NEXT: vldrb.u16 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 127 %0 = bitcast i8* %z to <8 x i8>* @@ -730,26 +551,10 @@ define i8* @post_ldrbu16_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb.w r2, [r0, #128] -; CHECK-NEXT: ldrb.w r3, [r0, #129] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #130] -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #131] -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: ldrb.w r2, [r0, #132] -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrb.w r2, [r0, #133] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrb.w r2, [r0, #134] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrb.w r2, [r0, #135] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: add.w r2, r0, #128 +; CHECK-NEXT: vldrb.u16 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 128 %0 = bitcast i8* %z to <8 x i8>* @@ -764,22 +569,7 @@ define i8* @post_ldrbs16_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0, #4] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[3], r2 -; 
CHECK-NEXT: ldrsb.w r2, [r0, #8] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #9] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #10] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #11] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vldrb.s16 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -795,22 +585,8 @@ define i8* @post_ldrbs16_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #4] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #8] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #9] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #10] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: adds r2, r0, #3 +; CHECK-NEXT: vldrb.s16 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -826,22 +602,8 @@ define i8* @post_ldrbs16_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0, #127] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #128] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #129] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #130] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #131] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #132] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #133] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #134] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: add.w r2, r0, #127 +; CHECK-NEXT: vldrb.s16 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ 
-857,22 +619,8 @@ define i8* @post_ldrbs16_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0, #128] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #129] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #130] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #131] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #132] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #133] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #134] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #135] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: add.w r2, r0, #128 +; CHECK-NEXT: vldrb.s16 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1096,8 +844,8 @@ define i8* @post_strh32_4(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0, #4] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, #4] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1111,9 +859,9 @@ define i8* @post_strh32_3(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str.w r1, [r0, #3] -; CHECK-NEXT: str.w r2, [r0, #7] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1127,9 +875,9 @@ define i8* @post_strh32_2(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str.w r1, [r0, #2] -; CHECK-NEXT: str.w r2, [r0, #6] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #2 +; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -1143,9 +891,9 @@ define i8* @post_strh32_254(i8* 
%y, i8* %x) { ; CHECK-LABEL: post_strh32_254: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str.w r1, [r0, #254] -; CHECK-NEXT: str.w r2, [r0, #258] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #254 +; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 254 @@ -1159,8 +907,9 @@ define i8* @post_strh32_256(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_256: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0, #256] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #256 +; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 256 @@ -1255,8 +1004,8 @@ define i8* @post_strb32_4(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str r1, [r0, #4] +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0, #4] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1270,8 +1019,9 @@ define i8* @post_strb32_3(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str.w r1, [r0, #3] +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vstrb.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1285,8 +1035,9 @@ define i8* @post_strb32_127(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str.w r1, [r0, #127] +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #127 +; CHECK-NEXT: vstrb.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -1300,8 +1051,9 @@ define i8* @post_strb32_128(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str.w r1, [r0, #128] +; CHECK-NEXT: vldrb.u32 q0, 
[r1] +; CHECK-NEXT: add.w r1, r0, #128 +; CHECK-NEXT: vstrb.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 128 @@ -1316,8 +1068,8 @@ define i8* @post_strb16_4(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0, #4] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0, #4] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1331,9 +1083,9 @@ define i8* @post_strb16_3(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str.w r1, [r0, #3] -; CHECK-NEXT: str.w r2, [r0, #7] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vstrb.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1347,9 +1099,9 @@ define i8* @post_strb16_127(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str.w r1, [r0, #127] -; CHECK-NEXT: str.w r2, [r0, #131] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #127 +; CHECK-NEXT: vstrb.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -1363,8 +1115,9 @@ define i8* @post_strb16_128(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0, #128] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #128 +; CHECK-NEXT: vstrb.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 128 Index: llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll @@ -117,16 +117,8 @@ define i8* @post_ldrhu32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_4: ; 
CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: ldrh r3, [r2, #4]! -; CHECK-NEXT: ldrh r0, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vldrh.u32 q0, [r0] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -142,15 +134,8 @@ define i8* @post_ldrhu32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: adds r0, #3 -; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -166,16 +151,8 @@ define i8* @post_ldrhu32_2(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: ldrh r3, [r2, #2]! 
-; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrh r3, [r0, #4] -; CHECK-NEXT: ldrh r0, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vldrh.u32 q0, [r0] +; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -191,15 +168,8 @@ define i8* @post_ldrhu32_254(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_254: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: adds r0, #254 -; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -215,15 +185,8 @@ define i8* @post_ldrhu32_256(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_256: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: add.w r0, r0, #256 -; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -240,14 +203,8 @@ define i8* @post_ldrhs32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsh.w r2, [r0] -; CHECK-NEXT: ldrsh.w r3, [r0, #2] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #6] -; CHECK-NEXT: ldrsh r2, [r0, #4]! 
-; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -263,19 +220,10 @@ define i8* @post_ldrhs32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrsh.w r2, [r0] -; CHECK-NEXT: ldrsh.w r3, [r0, #2] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #4] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrsh.w lr, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: adds r0, #3 -; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 3 %0 = bitcast i8* %x to <4 x i16>* @@ -289,14 +237,8 @@ define i8* @post_ldrhs32_2(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsh.w r2, [r0] -; CHECK-NEXT: ldrsh.w r3, [r0, #4] -; CHECK-NEXT: ldrsh.w r12, [r0, #6] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh r2, [r0, #2]! 
-; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -312,19 +254,10 @@ define i8* @post_ldrhs32_254(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_254: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrsh.w r2, [r0] -; CHECK-NEXT: ldrsh.w r3, [r0, #2] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #4] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrsh.w lr, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: adds r0, #254 -; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 254 %0 = bitcast i8* %x to <4 x i16>* @@ -338,19 +271,10 @@ define i8* @post_ldrhs32_256(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_256: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrsh.w r2, [r0] -; CHECK-NEXT: ldrsh.w r3, [r0, #2] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #4] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrsh.w lr, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: add.w r0, r0, #256 -; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 256 %0 = bitcast i8* %x to <4 x i16>* @@ -446,19 +370,10 @@ define i8* @post_ldrbu32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #3] -; 
CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 4 %0 = bitcast i8* %x to <4 x i8>* @@ -472,14 +387,8 @@ define i8* @post_ldrbu32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrb r2, [r0, #3]! -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vldrb.u32 q0, [r0] +; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -495,19 +404,10 @@ define i8* @post_ldrbu32_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #3] -; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: adds r0, #127 -; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 127 %0 = bitcast i8* %x to <4 x i8>* @@ -521,19 +421,10 @@ define i8* @post_ldrbu32_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #3] -; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: vmov.32 q0[3], 
lr ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 128 %0 = bitcast i8* %x to <4 x i8>* @@ -548,15 +439,8 @@ define i8* @post_ldrbs32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vldrb.s32 q0, [r0] ; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -572,14 +456,8 @@ define i8* @post_ldrbs32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb r2, [r0, #3]! 
-; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vldrb.s32 q0, [r0] +; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -595,15 +473,8 @@ define i8* @post_ldrbs32_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vldrb.s32 q0, [r0] ; CHECK-NEXT: adds r0, #127 -; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -619,15 +490,8 @@ define i8* @post_ldrbs32_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] +; CHECK-NEXT: vldrb.s32 q0, [r0] ; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -644,28 +508,10 @@ define i8* @post_ldrbu16_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: ldrb.w lr, [r0, #3] -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: ldrb r3, [r2, #4]! 
-; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r3 -; CHECK-NEXT: ldrb r3, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: ldrb r3, [r0, #6] -; CHECK-NEXT: ldrb r0, [r0, #7] -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vldrb.u16 q0, [r0] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 4 %0 = bitcast i8* %x to <8 x i8>* @@ -679,28 +525,10 @@ define i8* @post_ldrbu16_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: ldrb r3, [r2, #3]! -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: ldrb.w lr, [r0, #4] -; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: ldrb r3, [r0, #5] -; CHECK-NEXT: vmov.16 q0[4], lr -; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: ldrb r3, [r0, #6] -; CHECK-NEXT: ldrb r0, [r0, #7] -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vldrb.u16 q0, [r0] +; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 3 %0 = bitcast i8* %x to <8 x i8>* @@ -714,27 +542,10 @@ define i8* @post_ldrbu16_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #3] -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: ldrb r2, [r0, #4] -; CHECK-NEXT: vmov.16 q0[3], lr 
-; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrb r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrb r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrb r2, [r0, #7] +; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: adds r0, #127 -; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 127 %0 = bitcast i8* %x to <8 x i8>* @@ -748,27 +559,10 @@ define i8* @post_ldrbu16_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #3] -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: ldrb r2, [r0, #4] -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrb r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrb r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrb r2, [r0, #7] +; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 128 %0 = bitcast i8* %x to <8 x i8>* @@ -783,24 +577,8 @@ define i8* @post_ldrbs16_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: ldrsb r3, [r2, #4]! 
-; CHECK-NEXT: vmov.16 q0[4], r3 -; CHECK-NEXT: ldrsb.w r3, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: ldrsb.w r3, [r0, #6] -; CHECK-NEXT: ldrsb.w r0, [r0, #7] -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vldrb.s16 q0, [r0] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -816,24 +594,8 @@ define i8* @post_ldrbs16_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: ldrsb r3, [r2, #3]! -; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: ldrsb.w r3, [r0, #4] -; CHECK-NEXT: vmov.16 q0[4], r3 -; CHECK-NEXT: ldrsb.w r3, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: ldrsb.w r3, [r0, #6] -; CHECK-NEXT: ldrsb.w r0, [r0, #7] -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vldrb.s16 q0, [r0] +; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -849,23 +611,8 @@ define i8* @post_ldrbs16_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: vldrb.s16 q0, [r0] ; CHECK-NEXT: adds r0, #127 -; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] 
; CHECK-NEXT: bx lr entry: @@ -881,23 +628,8 @@ define i8* @post_ldrbs16_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb.w r2, [r0] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #7] +; CHECK-NEXT: vldrb.s16 q0, [r0] ; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1127,9 +859,9 @@ define i8* @post_strh32_4(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: str r2, [r0, #4]! 
+; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1143,8 +875,8 @@ define i8* @post_strh32_3(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: bx lr entry: @@ -1159,8 +891,8 @@ define i8* @post_strh32_2(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: bx lr entry: @@ -1175,8 +907,8 @@ define i8* @post_strh32_254(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_254: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: adds r0, #254 ; CHECK-NEXT: bx lr entry: @@ -1191,8 +923,8 @@ define i8* @post_strh32_256(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_256: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: add.w r0, r0, #256 ; CHECK-NEXT: bx lr entry: @@ -1289,8 +1021,9 @@ define i8* @post_strb32_4(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str r1, [r0], #4 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1304,8 +1037,9 @@ define i8* @post_strb32_3(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str r1, [r0], #3 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: 
vstrb.32 q0, [r0] +; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1319,8 +1053,9 @@ define i8* @post_strb32_127(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str r1, [r0], #127 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0] +; CHECK-NEXT: adds r0, #127 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -1334,8 +1069,9 @@ define i8* @post_strb32_128(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str r1, [r0], #128 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0] +; CHECK-NEXT: adds r0, #128 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 128 @@ -1350,9 +1086,9 @@ define i8* @post_strb16_4(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: str r2, [r0, #4]! 
+; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1366,8 +1102,8 @@ define i8* @post_strb16_3(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0] ; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: bx lr entry: @@ -1382,8 +1118,8 @@ define i8* @post_strb16_127(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0] ; CHECK-NEXT: adds r0, #127 ; CHECK-NEXT: bx lr entry: @@ -1398,8 +1134,8 @@ define i8* @post_strb16_128(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0] ; CHECK-NEXT: adds r0, #128 ; CHECK-NEXT: bx lr entry: Index: llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll @@ -117,14 +117,8 @@ define i8* @post_ldrhu32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0, #4]! 
-; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh r2, [r0, #6] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vldrh.u32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -140,14 +134,8 @@ define i8* @post_ldrhu32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0, #3]! -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh r2, [r0, #6] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -163,14 +151,8 @@ define i8* @post_ldrhu32_2(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0, #2]! -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh r2, [r0, #6] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -186,14 +168,8 @@ define i8* @post_ldrhu32_254(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_254: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0, #254]! 
-; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh r2, [r0, #4] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh r2, [r0, #6] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -209,15 +185,8 @@ define i8* @post_ldrhu32_256(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhu32_256: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh.w r2, [r0, #256] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #258] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #260] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrh.w r2, [r0, #262] ; CHECK-NEXT: add.w r0, r0, #256 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -234,14 +203,8 @@ define i8* @post_ldrhs32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsh r2, [r0, #4]! -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r2, [r0, #2] -; CHECK-NEXT: ldrsh.w r3, [r0, #4] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vldrh.s32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -257,14 +220,8 @@ define i8* @post_ldrhs32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsh r2, [r0, #3]! 
-; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r2, [r0, #2] -; CHECK-NEXT: ldrsh.w r3, [r0, #4] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -280,14 +237,8 @@ define i8* @post_ldrhs32_2(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsh r2, [r0, #2]! -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r2, [r0, #2] -; CHECK-NEXT: ldrsh.w r3, [r0, #4] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -303,14 +254,8 @@ define i8* @post_ldrhs32_254(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_254: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsh r2, [r0, #254]! 
-; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r2, [r0, #2] -; CHECK-NEXT: ldrsh.w r3, [r0, #4] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #6] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -326,19 +271,10 @@ define i8* @post_ldrhs32_256(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrhs32_256: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrsh.w r2, [r0, #256] -; CHECK-NEXT: ldrsh.w r3, [r0, #258] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsh.w r12, [r0, #260] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldrsh.w lr, [r0, #262] -; CHECK-NEXT: vmov.32 q0[2], r12 ; CHECK-NEXT: add.w r0, r0, #256 -; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 256 %0 = bitcast i8* %z to <4 x i16>* @@ -434,14 +370,8 @@ define i8* @post_ldrbu32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrb r2, [r0, #4]! -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb r2, [r0, #1] -; CHECK-NEXT: ldrb r3, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #3] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -457,14 +387,8 @@ define i8* @post_ldrbu32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrb r2, [r0, #3]! 
-; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb r2, [r0, #1] -; CHECK-NEXT: ldrb r3, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #3] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -480,14 +404,8 @@ define i8* @post_ldrbu32_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrb r2, [r0, #127]! -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb r2, [r0, #1] -; CHECK-NEXT: ldrb r3, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #3] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -503,14 +421,8 @@ define i8* @post_ldrbu32_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu32_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrb r2, [r0, #128]! -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb r2, [r0, #1] -; CHECK-NEXT: ldrb r3, [r0, #2] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #3] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -527,14 +439,8 @@ define i8* @post_ldrbs32_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb r2, [r0, #4]! 
-; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vldrb.s32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -550,14 +456,8 @@ define i8* @post_ldrbs32_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb r2, [r0, #3]! -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrb.s32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -573,14 +473,8 @@ define i8* @post_ldrbs32_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb r2, [r0, #127]! -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrb.s32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -596,14 +490,8 @@ define i8* @post_ldrbs32_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs32_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb r2, [r0, #128]! 
-; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.s32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -620,26 +508,10 @@ define i8* @post_ldrbu16_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0, #4]! -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb r2, [r0, #1] -; CHECK-NEXT: ldrb r3, [r0, #2] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #3] -; CHECK-NEXT: vmov.16 q0[2], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #4] -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: ldrb r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[4], lr -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrb r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrb r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vldrb.u16 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 4 %0 = bitcast i8* %z to <8 x i8>* @@ -653,26 +525,10 @@ define i8* @post_ldrbu16_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0, #3]! 
-; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb r2, [r0, #1] -; CHECK-NEXT: ldrb r3, [r0, #2] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #3] -; CHECK-NEXT: vmov.16 q0[2], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #4] -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: ldrb r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[4], lr -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrb r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrb r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 3 %0 = bitcast i8* %z to <8 x i8>* @@ -686,26 +542,10 @@ define i8* @post_ldrbu16_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0, #127]! -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb r2, [r0, #1] -; CHECK-NEXT: ldrb r3, [r0, #2] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #3] -; CHECK-NEXT: vmov.16 q0[2], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #4] -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: ldrb r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[4], lr -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrb r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrb r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 127 %0 = bitcast i8* %z to <8 x i8>* @@ -719,26 +559,10 @@ define i8* @post_ldrbu16_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbu16_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrb r2, [r0, #128]! 
-; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrb r2, [r0, #1] -; CHECK-NEXT: ldrb r3, [r0, #2] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #3] -; CHECK-NEXT: vmov.16 q0[2], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #4] -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: ldrb r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[4], lr -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrb r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrb r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 128 %0 = bitcast i8* %z to <8 x i8>* @@ -753,22 +577,8 @@ define i8* @post_ldrbs16_4(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb r2, [r0, #4]! -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vldrb.s16 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -784,22 +594,8 @@ define i8* @post_ldrbs16_3(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb r2, [r0, #3]! 
-; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrb.s16 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -815,22 +611,8 @@ define i8* @post_ldrbs16_127(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb r2, [r0, #127]! -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrb.s16 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -846,22 +628,8 @@ define i8* @post_ldrbs16_128(i8* %x, i8* %y) { ; CHECK-LABEL: post_ldrbs16_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrsb r2, [r0, #128]! 
-; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #2] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: ldrsb.w r2, [r0, #7] -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.s16 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1091,9 +859,9 @@ define i8* @post_strh32_4(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str r1, [r0, #4]! -; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1107,9 +875,9 @@ define i8* @post_strh32_3(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str r1, [r0, #3]! -; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1123,9 +891,9 @@ define i8* @post_strh32_2(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str r1, [r0, #2]! -; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -1139,9 +907,9 @@ define i8* @post_strh32_254(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_254: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str r1, [r0, #254]! 
-; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 254 @@ -1155,9 +923,9 @@ define i8* @post_strh32_256(i8* %y, i8* %x) { ; CHECK-LABEL: post_strh32_256: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: strd r1, r2, [r0, #256] ; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 256 @@ -1253,8 +1021,9 @@ define i8* @post_strb32_4(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str r1, [r0, #4]! +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1268,8 +1037,9 @@ define i8* @post_strb32_3(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str r1, [r0, #3]! +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1283,8 +1053,9 @@ define i8* @post_strb32_127(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str r1, [r0, #127]! +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -1298,8 +1069,9 @@ define i8* @post_strb32_128(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb32_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: str r1, [r0, #128]! 
+; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 128 @@ -1314,9 +1086,9 @@ define i8* @post_strb16_4(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str r1, [r0, #4]! -; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -1330,9 +1102,9 @@ define i8* @post_strb16_3(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str r1, [r0, #3]! -; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1346,9 +1118,9 @@ define i8* @post_strb16_127(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_127: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str r1, [r0, #127]! -; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 127 @@ -1362,9 +1134,9 @@ define i8* @post_strb16_128(i8* %y, i8* %x) { ; CHECK-LABEL: post_strb16_128: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r2, [r1] -; CHECK-NEXT: str r1, [r0, #128]! 
-; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 128 Index: llvm/trunk/test/CodeGen/Thumb2/mve-shuffle.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-shuffle.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-shuffle.ll @@ -494,15 +494,24 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: movs r0, #7 -; CHECK-NEXT: movs r1, #1 -; CHECK-NEXT: strh.w r0, [sp, #2] +; CHECK-NEXT: adr r1, .LCPI30_0 ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: strh.w r0, [sp] -; CHECK-NEXT: movt r1, #9 -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vmov.f32 s1, s5 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vstrh.32 q0, [r2] +; CHECK-NEXT: ldrd r0, r1, [sp], #8 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI30_0: +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 9 @ 0x9 entry: %f = shufflevector <8 x i16> %v, <8 x i16> , <4 x i32> %0 = bitcast <4 x i16> %f to i64 Index: llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s + +define void @foo_int8_int32(<4 x i8>* %dest, <4 x i32>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_int8_int32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0] +; 
CHECK-NEXT: bx lr +entry: + %wide.load = load <4 x i32>, <4 x i32>* %src, align 4 + %0 = trunc <4 x i32> %wide.load to <4 x i8> + store <4 x i8> %0, <4 x i8>* %dest, align 1 + ret void +} + + +define void @foo_int16_int32(<4 x i16>* %dest, <4 x i32>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_int16_int32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %wide.load = load <4 x i32>, <4 x i32>* %src, align 4 + %0 = trunc <4 x i32> %wide.load to <4 x i16> + store <4 x i16> %0, <4 x i16>* %dest, align 2 + ret void +} + + +define void @foo_int8_int16(<8 x i8>* %dest, <8 x i16>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_int8_int16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %wide.load = load <8 x i16>, <8 x i16>* %src, align 2 + %0 = trunc <8 x i16> %wide.load to <8 x i8> + store <8 x i8> %0, <8 x i8>* %dest, align 1 + ret void +} + + +define void @foo_int32_int8(<4 x i32>* %dest, <4 x i8>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_int32_int8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %wide.load = load <4 x i8>, <4 x i8>* %src, align 1 + %0 = sext <4 x i8> %wide.load to <4 x i32> + store <4 x i32> %0, <4 x i32>* %dest, align 4 + ret void +} + + +define void @foo_int16_int8(<8 x i16>* %dest, <8 x i8>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_int16_int8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s16 q0, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %wide.load = load <8 x i8>, <8 x i8>* %src, align 1 + %0 = sext <8 x i8> %wide.load to <8 x i16> + store <8 x i16> %0, <8 x i16>* %dest, align 2 + ret void +} + + +define void @foo_int32_int16(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_int32_int16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q0, [r1] +; 
CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %wide.load = load <4 x i16>, <4 x i16>* %src, align 2 + %0 = sext <4 x i16> %wide.load to <4 x i32> + store <4 x i32> %0, <4 x i32>* %dest, align 4 + ret void +} + + +define void @foo_uint32_uint8(<4 x i32>* %dest, <4 x i8>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_uint32_uint8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %wide.load = load <4 x i8>, <4 x i8>* %src, align 1 + %0 = zext <4 x i8> %wide.load to <4 x i32> + store <4 x i32> %0, <4 x i32>* %dest, align 4 + ret void +} + + +define void @foo_uint16_uint8(<8 x i16>* %dest, <8 x i8>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_uint16_uint8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %wide.load = load <8 x i8>, <8 x i8>* %src, align 1 + %0 = zext <8 x i8> %wide.load to <8 x i16> + store <8 x i16> %0, <8 x i16>* %dest, align 2 + ret void +} + + +define void @foo_uint32_uint16(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_uint32_uint16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %wide.load = load <4 x i16>, <4 x i16>* %src, align 2 + %0 = zext <4 x i16> %wide.load to <4 x i32> + store <4 x i32> %0, <4 x i32>* %dest, align 4 + ret void +}