Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp @@ -14066,11 +14066,18 @@ return true; } + // These are for truncated stores/narrowing loads. They are fine so long as + // the alignment is at least the size of the item being loaded + if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) && + Alignment >= VT.getScalarSizeInBits() / 8) { + if (Fast) + *Fast = true; + return true; + } + if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 && Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 && - Ty != MVT::v2f64 && - // These are for truncated stores - Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16) + Ty != MVT::v2f64) return false; if (Subtarget->isLittle()) { Index: llvm/trunk/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrMVE.td +++ llvm/trunk/lib/Target/ARM/ARMInstrMVE.td @@ -4839,36 +4839,48 @@ // Widening/Narrowing Loads/Stores +let MinAlignment = 2 in { + def truncstorevi16_align2 : PatFrag<(ops node:$val, node:$ptr), + (truncstorevi16 node:$val, node:$ptr)>; +} + let Predicates = [HasMVEInt] in { - def : Pat<(truncstorevi8 (v8i16 MQPR:$val), t2addrmode_imm7<1>:$addr), - (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<1>:$addr)>; - def : Pat<(truncstorevi8 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr), - (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<1>:$addr)>; - def : Pat<(truncstorevi16 (v4i32 MQPR:$val), t2addrmode_imm7<2>:$addr), - (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<2>:$addr)>; + def : Pat<(truncstorevi8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr)>; + def : Pat<(truncstorevi8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr)>; + def : Pat<(truncstorevi16_align2 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr), + (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr)>; +} + + +let MinAlignment = 2 in { + def extloadvi16_align2 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>; + def sextloadvi16_align2 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>; + def zextloadvi16_align2 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>; } multiclass MVEExtLoad { + string Align, Operand am> { def _Any : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) - (!cast("extloadvi" # SrcElemBits) am:$addr)), + (!cast("extloadvi" # SrcElemBits # Align) am:$addr)), (!cast("MVE_VLDR" # SrcElemType # "U" # DestElemBits) am:$addr)>; def _Z : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) - (!cast("zextloadvi" # SrcElemBits) am:$addr)), + (!cast("zextloadvi" # SrcElemBits # Align) am:$addr)), (!cast("MVE_VLDR" # SrcElemType # "U" # DestElemBits) am:$addr)>; def _S : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) - (!cast("sextloadvi" # SrcElemBits) am:$addr)), + (!cast("sextloadvi" # SrcElemBits # Align) am:$addr)), (!cast("MVE_VLDR" # SrcElemType # "S" # DestElemBits) am:$addr)>; } let Predicates = [HasMVEInt] in { - defm : MVEExtLoad<"4", "32", "8", "B", t2addrmode_imm7<1>>; - defm : MVEExtLoad<"8", "16", "8", "B", t2addrmode_imm7<1>>; - defm : MVEExtLoad<"4", "32", "16", "H", t2addrmode_imm7<2>>; + defm : MVEExtLoad<"4", "32", "8", "B", "", t2addrmode_imm7<0>>; + defm : MVEExtLoad<"8", "16", "8", "B", "", t2addrmode_imm7<0>>; + defm : MVEExtLoad<"4", "32", "16", "H", "_align2", t2addrmode_imm7<1>>; } Index: llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll @@ -148,8 +148,7 @@ define i8* @ldrhu32_2(i8* %x, i8* %y) { ; CHECK-LABEL: ldrhu32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r2, r0, #2 -; CHECK-NEXT: vldrh.u32 q0, [r2] +; CHECK-NEXT: vldrh.u32 q0, [r0, #2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -233,8 +232,7 @@ define i8* @ldrhs32_2(i8* %x, i8* %y) { ; CHECK-LABEL: ldrhs32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r2, r0, #2 -; CHECK-NEXT: vldrh.s32 q0, [r2] +; CHECK-NEXT: vldrh.s32 q0, [r0, #2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -380,8 +378,7 @@ define i8* @ldrbu32_3(i8* %x, i8* %y) { ; CHECK-LABEL: ldrbu32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r2, r0, #3 -; CHECK-NEXT: vldrb.u32 q0, [r2] +; CHECK-NEXT: vldrb.u32 q0, [r0, #3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -448,8 +445,7 @@ define i8* @ldrbs32_3(i8* %x, i8* %y) { ; CHECK-LABEL: ldrbs32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r2, r0, #3 -; CHECK-NEXT: vldrb.s32 q0, [r2] +; CHECK-NEXT: vldrb.s32 q0, [r0, #3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -516,8 +512,7 @@ define i8* @ldrbu16_3(i8* %x, i8* %y) { ; CHECK-LABEL: ldrbu16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r2, r0, #3 -; CHECK-NEXT: vldrb.u16 q0, [r2] +; CHECK-NEXT: vldrb.u16 q0, [r0, #3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -584,8 +579,7 @@ define i8* @ldrbs16_3(i8* %x, i8* %y) { ; CHECK-LABEL: ldrbs16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r2, r0, #3 -; CHECK-NEXT: vldrb.s16 q0, [r2] +; CHECK-NEXT: vldrb.s16 q0, [r0, #3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -758,9 +752,15 @@ define i8* @ldrhi32_align1(i8* %x, i8* %y) { ; CHECK-LABEL: ldrhi32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r2, r0, #3 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: ldr.w r3, [r0, #7] +; CHECK-NEXT: ldr.w r2, [r0, #3] +; CHECK-NEXT: strd r2, r3, [sp] +; CHECK-NEXT: mov r2, sp ; CHECK-NEXT: vldrh.s32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 3 @@ -952,8 +952,7 @@ ; CHECK-LABEL: strh32_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: adds r1, r0, #2 -; CHECK-NEXT: vstrh.32 q0, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, #2] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -1095,8 +1094,7 @@ ; CHECK-LABEL: strb32_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: vstrb.32 q0, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0, #3] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1159,8 +1157,7 @@ ; CHECK-LABEL: strb16_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u16 q0, [r1] -; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: vstrb.16 q0, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0, #3] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1329,9 +1326,15 @@ define i8* @strhi32_align1(i8* %y, i8* %x) { ; CHECK-LABEL: strhi32_align1: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: mov r1, sp ; CHECK-NEXT: vstrh.32 q0, [r1] +; CHECK-NEXT: ldrd r1, r2, [sp] +; CHECK-NEXT: str.w r1, [r0, #3] +; CHECK-NEXT: str.w r2, [r0, #7] +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 Index: llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll @@ -774,9 +774,16 @@ define i8* @ldrhi32_align1(i8* %x, i8* %y) { ; CHECK-LABEL: ldrhi32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r0] -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: ldr r3, [r0, #4] +; CHECK-NEXT: ldr r2, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: strd r2, r3, [sp] +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vldrh.s32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 3 @@ -1360,9 +1367,16 @@ define i8* @strhi32_align1(i8* %y, i8* %x) { ; CHECK-LABEL: strhi32_align1: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstrh.32 q0, [r1] +; CHECK-NEXT: ldrd r1, r2, [sp] +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: str r2, [r0, #4] ; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 Index: llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll @@ -151,8 +151,8 @@ define i8* @ldrhu32_2(i8* %x, i8* %y) { ; CHECK-LABEL: ldrhu32_2: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r0, #2] ; CHECK-NEXT: adds r0, #2 -; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -237,8 +237,8 @@ define i8* @ldrhs32_2(i8* %x, i8* %y) { ; CHECK-LABEL: ldrhs32_2: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q0, [r0, #2] ; CHECK-NEXT: adds r0, #2 -; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -387,8 +387,8 @@ define i8* @ldrbu32_3(i8* %x, i8* %y) { ; CHECK-LABEL: ldrbu32_3: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r0, #3] ; CHECK-NEXT: adds r0, #3 -; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -456,8 +456,8 @@ define i8* @ldrbs32_3(i8* %x, i8* %y) { ; CHECK-LABEL: ldrbs32_3: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q0, [r0, #3] ; CHECK-NEXT: adds r0, #3 -; CHECK-NEXT: vldrb.s32 q0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -525,8 +525,8 @@ define i8* @ldrbu16_3(i8* %x, i8* %y) { ; CHECK-LABEL: ldrbu16_3: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r0, #3] ; CHECK-NEXT: adds r0, #3 -; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -594,8 +594,8 @@ define i8* @ldrbs16_3(i8* %x, i8* %y) { ; CHECK-LABEL: ldrbs16_3: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s16 q0, [r0, #3] ; CHECK-NEXT: adds r0, #3 -; CHECK-NEXT: vldrb.s16 q0, [r0] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -774,9 +774,16 @@ define i8* @ldrhi32_align1(i8* %x, i8* %y) { ; CHECK-LABEL: ldrhi32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r0, #3 -; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: ldr r2, [r0, #3]! +; CHECK-NEXT: str r2, [sp] +; CHECK-NEXT: ldr r2, [r0, #4] +; CHECK-NEXT: str r2, [sp, #4] +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vldrh.s32 q0, [r2] ; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %x, i32 3 @@ -972,9 +979,9 @@ define i8* @strh32_2(i8* %y, i8* %x) { ; CHECK-LABEL: strh32_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, #2] +; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 2 @@ -1118,9 +1125,9 @@ define i8* @strb32_3(i8* %y, i8* %x) { ; CHECK-LABEL: strb32_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vstrb.32 q0, [r0] +; CHECK-NEXT: vstrb.32 q0, [r0, #3] +; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1183,9 +1190,9 @@ define i8* @strb16_3(i8* %y, i8* %x) { ; CHECK-LABEL: strb16_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: vldrb.u16 q0, [r1] -; CHECK-NEXT: vstrb.16 q0, [r0] +; CHECK-NEXT: vstrb.16 q0, [r0, #3] +; CHECK-NEXT: adds r0, #3 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1360,9 +1367,15 @@ define i8* @strhi32_align1(i8* %y, i8* %x) { ; CHECK-LABEL: strhi32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstrh.32 q0, [r1] +; CHECK-NEXT: ldrd r1, r2, [sp] +; CHECK-NEXT: str r1, [r0, #3]! +; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 Index: llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s define void @foo_int8_int32(<4 x i8>* %dest, <4 x i32>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_int8_int32: @@ -14,7 +14,6 @@ ret void } - define void @foo_int16_int32(<4 x i16>* %dest, <4 x i32>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_int16_int32: ; CHECK: @ %bb.0: @ %entry @@ -28,7 +27,6 @@ ret void } - define void @foo_int8_int16(<8 x i8>* %dest, <8 x i16>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_int8_int16: ; CHECK: @ %bb.0: @ %entry @@ -42,7 +40,6 @@ ret void } - define void @foo_int32_int8(<4 x i32>* %dest, <4 x i8>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_int32_int8: ; CHECK: @ %bb.0: @ %entry @@ -56,7 +53,6 @@ ret void } - define void @foo_int16_int8(<8 x i16>* %dest, <8 x i8>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_int16_int8: ; CHECK: @ %bb.0: @ %entry @@ -70,7 +66,6 @@ ret void } - define void @foo_int32_int16(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_int32_int16: ; CHECK: @ %bb.0: @ %entry @@ -84,7 +79,6 @@ ret void } - define void @foo_uint32_uint8(<4 x i32>* %dest, <4 x i8>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_uint32_uint8: ; CHECK: @ %bb.0: @ %entry @@ -98,7 +92,6 @@ ret void } - define void @foo_uint16_uint8(<8 x i16>* %dest, <8 x i8>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_uint16_uint8: ; CHECK: @ %bb.0: @ %entry @@ -112,7 +105,6 @@ ret void } - define void @foo_uint32_uint16(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_uint32_uint16: ; CHECK: @ %bb.0: @ %entry @@ -125,3 +117,66 @@ store <4 x i32> %0, <4 x i32>* %dest, align 4 ret void } + + + + +define void @foo_int16_int32_align1(<4 x i16>* %dest, <4 x i32>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_int16_int32_align1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstrh.32 q0, [r1] +; CHECK-NEXT: ldrd r1, r2, [sp] +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %wide.load = load <4 x i32>, <4 x i32>* %src, align 4 + %0 = trunc <4 x i32> %wide.load to <4 x i16> + store <4 x i16> %0, <4 x i16>* %dest, align 1 + ret void +} + +define void @foo_int32_int16_align1(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_int32_int16_align1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: ldr r2, [r1] +; CHECK-NEXT: ldr r1, [r1, #4] +; CHECK-NEXT: strd r2, r1, [sp] +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %wide.load = load <4 x i16>, <4 x i16>* %src, align 1 + %0 = sext <4 x i16> %wide.load to <4 x i32> + store <4 x i32> %0, <4 x i32>* %dest, align 4 + ret void +} + +define void @foo_uint32_uint16_align1(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) { +; CHECK-LABEL: foo_uint32_uint16_align1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: ldr r2, [r1] +; CHECK-NEXT: ldr r1, [r1, #4] +; CHECK-NEXT: strd r2, r1, [sp] +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %wide.load = load <4 x i16>, <4 x i16>* %src, align 1 + %0 = zext <4 x i16> %wide.load to <4 x i32> + store <4 x i32> %0, <4 x i32>* %dest, align 4 + ret void +}