Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -14075,45 +14075,21 @@
     return true;
   }
 
-  if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
-      Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
-      Ty != MVT::v2f64)
-    return false;
-
-  if (Subtarget->isLittle()) {
-    // In little-endian MVE, the store instructions VSTRB.U8,
-    // VSTRH.U16 and VSTRW.U32 all store the vector register in
-    // exactly the same format, and differ only in the range of
-    // their immediate offset field and the required alignment.
-    //
-    // In particular, VSTRB.U8 can store a vector at byte alignment.
-    // So at this stage we can simply say that loads/stores of all
-    // 128-bit wide vector types are permitted at any alignment,
-    // because we know at least _one_ instruction can manage that.
-    //
-    // Later on we might find that some of those loads are better
-    // generated as VLDRW.U32 if alignment permits, to take
-    // advantage of the larger immediate range. But for the moment,
-    // all that matters is that if we don't lower the load then
-    // _some_ instruction can handle it.
+  // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
+  // VSTRW.U32 all store the vector register in exactly the same format, and
+  // differ only in the range of their immediate offset field and the required
+  // alignment. So there is always a store that can be used, regardless of the
+  // actual type.
+  //
+  // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
+  // VREV64.8) pair and get the same effect. This will likely be better than
+  // aligning the vector through the stack.
+  if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
+      Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
+      Ty == MVT::v2f64) {
    if (Fast)
      *Fast = true;
    return true;
-  } else {
-    // In big-endian MVE, those instructions aren't so similar
-    // after all, because they reorder the bytes of the vector
-    // differently. So this time we can only store a particular
-    // kind of vector if its alignment is at least the element
-    // type. And we can't store vectors of i64 or f64 at all
-    // without having to do some postprocessing, because there's
-    // no VSTRD.U64.
-    if (Ty == MVT::v16i8 ||
-        ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
-        ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
-      if (Fast)
-        *Fast = true;
-      return true;
-    }
  }
 
  return false;
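
The hook above is the gatekeeper for the rest of the patch: once allowsMisalignedMemoryAccesses returns true for a 128-bit vector type, SelectionDAG keeps the access as a single vector load/store instead of expanding it through an aligned stack slot. A minimal IR sketch of the kind of access this now covers on big-endian targets (the function name is illustrative; the behaviour mirrors the store_4xi32_a1 test updated below):

; Sketch: an align-1 store that the hook now reports as supported. On a
; big-endian MVE target the patterns added below select it as a
; VREV32.8 + VSTRB.U8 pair rather than a stack round-trip.
define arm_aapcs_vfpcc void @store_v4i32_align1(<4 x i32>* %p, <4 x i32> %v) {
entry:
  store <4 x i32> %v, <4 x i32>* %p, align 1
  ret void
}
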
Index: llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
@@ -4820,13 +4820,6 @@
   defm : MVE_unpred_vector_load;
   defm : MVE_unpred_vector_load;
   defm : MVE_unpred_vector_load;
-
-  def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
-            (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
-  def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
-            (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
-  def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
-            (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
 }
 
 let Predicates = [HasMVEInt, IsBE] in {
@@ -4841,6 +4834,41 @@
   def : MVE_unpred_vector_load_typed;
   def : MVE_unpred_vector_load_typed;
   def : MVE_unpred_vector_load_typed;
+
+  // Other unaligned loads/stores need to go through a VREV
+  def : Pat<(v2f64 (load t2addrmode_imm7<0>:$addr)),
+            (v2f64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v2i64 (load t2addrmode_imm7<0>:$addr)),
+            (v2i64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v4i32 (load t2addrmode_imm7<0>:$addr)),
+            (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v4f32 (load t2addrmode_imm7<0>:$addr)),
+            (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v8i16 (load t2addrmode_imm7<0>:$addr)),
+            (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v8f16 (load t2addrmode_imm7<0>:$addr)),
+            (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(store (v2f64 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v2i64 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+}
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
+            (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+  def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
+            (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+  def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
+            (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
 }
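
The patterns above encode one strategy per element size: the memory side always goes through the byte-wise VLDRB.U8/VSTRB.U8, and a VREV of the element width restores lane order in the register. Read directly off the patterns, the mapping is sketched below (illustrative IR, not from the patch; the surrounding code generator may add further VREV64 instructions for the big-endian calling convention):

; Big-endian selection sketch for unaligned 128-bit vector accesses:
;   load  <16 x i8> -> vldrb.u8             (bytes need no correction)
;   load  <8 x i16> -> vldrb.u8 + vrev16.8
;   load  <4 x i32> -> vldrb.u8 + vrev32.8
;   load  <2 x i64> -> vldrb.u8 + vrev64.8  (v2i64/v2f64 previously unsupported in BE)
;   store <4 x i32> -> vrev32.8 + vstrb.8
define arm_aapcs_vfpcc <2 x i64> @load_v2i64_align1(<2 x i64>* %p) {
entry:
  %l = load <2 x i64>, <2 x i64>* %p, align 1
  ret <2 x i64> %l
}
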
Index: llvm/trunk/test/CodeGen/Thumb2/mve-be.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-be.ll
+++ llvm/trunk/test/CodeGen/Thumb2/mve-be.ll
@@ -29,47 +29,14 @@
 ;
 ; CHECK-BE-LABEL: load_load_add_store_align1:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #48
-; CHECK-BE-NEXT:    sub sp, #48
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    ldr.w r12, [r1]
-; CHECK-BE-NEXT:    ldr r3, [r1, #4]
-; CHECK-BE-NEXT:    ldr r2, [r1, #8]
-; CHECK-BE-NEXT:    ldr r1, [r1, #12]
-; CHECK-BE-NEXT:    strd r2, r1, [sp, #24]
-; CHECK-BE-NEXT:    mov r1, r0
-; CHECK-BE-NEXT:    strd r12, r3, [sp, #16]
-; CHECK-BE-NEXT:    ldr r2, [r1, #4]!
-; CHECK-BE-NEXT:    str r2, [sp, #4]
-; CHECK-BE-NEXT:    ldr r2, [r0]
-; CHECK-BE-NEXT:    str r2, [sp]
-; CHECK-BE-NEXT:    mov r2, r1
-; CHECK-BE-NEXT:    ldr r3, [r2, #4]!
-; CHECK-BE-NEXT:    str r3, [sp, #8]
-; CHECK-BE-NEXT:    ldr r3, [r2, #4]
-; CHECK-BE-NEXT:    str r3, [sp, #12]
-; CHECK-BE-NEXT:    add r3, sp, #16
-; CHECK-BE-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-BE-NEXT:    mov r3, sp
-; CHECK-BE-NEXT:    vldrw.u32 q1, [r3]
-; CHECK-BE-NEXT:    add r3, sp, #32
+; CHECK-BE-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-BE-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-BE-NEXT:    vrev32.8 q0, q0
+; CHECK-BE-NEXT:    vrev32.8 q1, q1
 ; CHECK-BE-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-BE-NEXT:    vstrw.32 q0, [r3]
-; CHECK-BE-NEXT:    ldrd r3, r4, [sp, #40]
-; CHECK-BE-NEXT:    ldrd r12, lr, [sp, #32]
-; CHECK-BE-NEXT:    str r4, [r2, #4]
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    str r3, [r2]
-; CHECK-BE-NEXT:    str.w lr, [r1]
-; CHECK-BE-NEXT:    str.w r12, [r0]
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    vrev32.8 q0, q0
+; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
+; CHECK-BE-NEXT:    bx lr
 entry:
   %l1 = load <4 x i32>, <4 x i32>* %src1, align 1
   %l2 = load <4 x i32>, <4 x i32>* %src2, align 1
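
The old CHECK-BE body above shows what the conservative hook answer cost: both vectors took a round trip through an aligned stack slot via scalar ldr/str pairs. The new code relies on a byte-layout identity, worked through here for one element (an illustrative note, not part of the patch):

; For memory bytes B0 B1 B2 B3, big-endian element 0 of a <4 x i32> is
; (B0<<24)|(B1<<16)|(B2<<8)|B3. vldrb.u8 puts B0 into the least-significant
; byte of lane 0, giving (B3<<24)|(B2<<16)|(B1<<8)|B0, and vrev32.8 reverses
; the bytes within each 32-bit lane, which is exactly the needed correction.
; The same argument with 16- and 64-bit lanes yields vrev16.8 and vrev64.8.
; This sketch mirrors load_load_add_store_align1 from the test above:
define arm_aapcs_vfpcc void @add_v4i32_align1(<4 x i32>* %p, <4 x i32>* %q) {
entry:
  %a = load <4 x i32>, <4 x i32>* %p, align 1
  %b = load <4 x i32>, <4 x i32>* %q, align 1
  %s = add <4 x i32> %a, %b
  store <4 x i32> %s, <4 x i32>* %p, align 1
  ret void
}
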
Index: llvm/trunk/test/CodeGen/Thumb2/mve-loadstore.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-loadstore.ll
+++ llvm/trunk/test/CodeGen/Thumb2/mve-loadstore.ll
@@ -1,72 +1,138 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
 
 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4(<4 x i32>* %vp) {
-; CHECK-LABEL: load_4xi32_a4:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: load_4xi32_a4:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: load_4xi32_a4:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-BE-NEXT:    vshr.u32 q1, q0, #1
+; CHECK-BE-NEXT:    vrev64.32 q0, q1
+; CHECK-BE-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %vp, align 4
-  ret <4 x i32> %0
+  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
 }
 
 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a2(<4 x i32>* %vp) {
-; CHECK-LABEL: load_4xi32_a2:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: load_4xi32_a2:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: load_4xi32_a2:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-BE-NEXT:    vrev32.8 q0, q0
+; CHECK-BE-NEXT:    vshr.u32 q1, q0, #1
+; CHECK-BE-NEXT:    vrev64.32 q0, q1
+; CHECK-BE-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %vp, align 2
-  ret <4 x i32> %0
+  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
 }
 
 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a1(<4 x i32>* %vp) {
-; CHECK-LABEL: load_4xi32_a1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: load_4xi32_a1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: load_4xi32_a1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-BE-NEXT:    vrev32.8 q0, q0
+; CHECK-BE-NEXT:    vshr.u32 q1, q0, #1
+; CHECK-BE-NEXT:    vrev64.32 q0, q1
+; CHECK-BE-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %vp, align 1
-  ret <4 x i32> %0
+  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
 }
 
 define arm_aapcs_vfpcc void @store_4xi32_a4(<4 x i32>* %vp, <4 x i32> %val) {
-; CHECK-LABEL: store_4xi32_a4:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vstrw.32 q0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: store_4xi32_a4:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT:    vstrw.32 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: store_4xi32_a4:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
+; CHECK-BE-NEXT:    vstrw.32 q0, [r0]
+; CHECK-BE-NEXT:    bx lr
 entry:
-  store <4 x i32> %val, <4 x i32>* %vp, align 4
+  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %0, <4 x i32>* %vp, align 4
   ret void
 }
 
 define arm_aapcs_vfpcc void @store_4xi32_a2(<4 x i32>* %vp, <4 x i32> %val) {
-; CHECK-LABEL: store_4xi32_a2:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vstrh.16 q0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: store_4xi32_a2:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT:    vstrh.16 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: store_4xi32_a2:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
+; CHECK-BE-NEXT:    vrev32.8 q0, q0
+; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
+; CHECK-BE-NEXT:    bx lr
 entry:
-  store <4 x i32> %val, <4 x i32>* %vp, align 2
+  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %0, <4 x i32>* %vp, align 2
   ret void
 }
 
 define arm_aapcs_vfpcc void @store_4xi32_a1(<4 x i32>* %vp, <4 x i32> %val) {
-; CHECK-LABEL: store_4xi32_a1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vstrb.8 q0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: store_4xi32_a1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT:    vstrb.8 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: store_4xi32_a1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
+; CHECK-BE-NEXT:    vrev32.8 q0, q0
+; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
+; CHECK-BE-NEXT:    bx lr
 entry:
-  store <4 x i32> %val, <4 x i32>* %vp, align 1
+  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %0, <4 x i32>* %vp, align 1
   ret void
 }
 
 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_pos(i32* %ip) {
-; CHECK-LABEL: load_4xi32_a4_offset_pos:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    add.w r0, r0, #508
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: load_4xi32_a4_offset_pos:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    add.w r0, r0, #508
+; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: load_4xi32_a4_offset_pos:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    add.w r0, r0, #508
+; CHECK-BE-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-BE-NEXT:    vrev64.8 q0, q1
+; CHECK-BE-NEXT:    bx lr
 entry:
   %ipoffset = getelementptr inbounds i32, i32* %ip, i32 127
   %vp = bitcast i32* %ipoffset to <4 x i32>*
@@ -75,11 +141,18 @@
 }
 
 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_neg(i32* %ip) {
-; CHECK-LABEL: load_4xi32_a4_offset_neg:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    sub.w r0, r0, #508
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: load_4xi32_a4_offset_neg:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    sub.w r0, r0, #508
+; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: load_4xi32_a4_offset_neg:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    sub.w r0, r0, #508
+; CHECK-BE-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-BE-NEXT:    vrev64.8 q0, q1
+; CHECK-BE-NEXT:    bx lr
 entry:
   %ipoffset = getelementptr inbounds i32, i32* %ip, i32 -127
   %vp = bitcast i32* %ipoffset to <4 x i32>*
@@ -88,19 +161,34 @@
 }
 
 define arm_aapcs_vfpcc <4 x i32> @loadstore_4xi32_stack_off16() {
-; CHECK-LABEL: loadstore_4xi32_stack_off16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #40
-; CHECK-NEXT:    sub sp, #40
-; CHECK-NEXT:    vmov.i32 q0, #0x1
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vstrw.32 q0, [r0]
-; CHECK-NEXT:    movs r0, #3
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16]
-; CHECK-NEXT:    str r0, [sp, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16]
-; CHECK-NEXT:    add sp, #40
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: loadstore_4xi32_stack_off16:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #40
+; CHECK-LE-NEXT:    sub sp, #40
+; CHECK-LE-NEXT:    vmov.i32 q0, #0x1
+; CHECK-LE-NEXT:    mov r0, sp
+; CHECK-LE-NEXT:    vstrw.32 q0, [r0]
+; CHECK-LE-NEXT:    movs r0, #3
+; CHECK-LE-NEXT:    vstrw.32 q0, [sp, #16]
+; CHECK-LE-NEXT:    str r0, [sp, #16]
+; CHECK-LE-NEXT:    vldrw.u32 q0, [sp, #16]
+; CHECK-LE-NEXT:    add sp, #40
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: loadstore_4xi32_stack_off16:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #40
+; CHECK-BE-NEXT:    sub sp, #40
+; CHECK-BE-NEXT:    vmov.i32 q0, #0x1
+; CHECK-BE-NEXT:    mov r0, sp
+; CHECK-BE-NEXT:    vstrw.32 q0, [r0]
+; CHECK-BE-NEXT:    movs r0, #3
+; CHECK-BE-NEXT:    vstrw.32 q0, [sp, #16]
+; CHECK-BE-NEXT:    str r0, [sp, #16]
+; CHECK-BE-NEXT:    vldrb.u8 q1, [sp, #16]
+; CHECK-BE-NEXT:    vrev64.8 q0, q1
+; CHECK-BE-NEXT:    add sp, #40
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = alloca [1 x [5 x [2 x i32]]], align 4
   %0 = bitcast [1 x [5 x [2 x i32]]]* %c to i8*
@@ -116,19 +204,34 @@
 }
 
 define arm_aapcs_vfpcc <8 x i16> @loadstore_8xi16_stack_off16() {
-; CHECK-LABEL: loadstore_8xi16_stack_off16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #40
-; CHECK-NEXT:    sub sp, #40
-; CHECK-NEXT:    vmov.i16 q0, #0x1
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vstrh.16 q0, [r0]
-; CHECK-NEXT:    movs r0, #3
-; CHECK-NEXT:    vstrh.16 q0, [sp, #16]
-; CHECK-NEXT:    strh.w r0, [sp, #16]
-; CHECK-NEXT:    vldrh.u16 q0, [sp, #16]
-; CHECK-NEXT:    add sp, #40
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: loadstore_8xi16_stack_off16:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #40
+; CHECK-LE-NEXT:    sub sp, #40
+; CHECK-LE-NEXT:    vmov.i16 q0, #0x1
+; CHECK-LE-NEXT:    mov r0, sp
+; CHECK-LE-NEXT:    vstrh.16 q0, [r0]
+; CHECK-LE-NEXT:    movs r0, #3
+; CHECK-LE-NEXT:    vstrh.16 q0, [sp, #16]
+; CHECK-LE-NEXT:    strh.w r0, [sp, #16]
+; CHECK-LE-NEXT:    vldrh.u16 q0, [sp, #16]
+; CHECK-LE-NEXT:    add sp, #40
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: loadstore_8xi16_stack_off16:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #40
+; CHECK-BE-NEXT:    sub sp, #40
+; CHECK-BE-NEXT:    vmov.i16 q0, #0x1
+; CHECK-BE-NEXT:    mov r0, sp
+; CHECK-BE-NEXT:    vstrh.16 q0, [r0]
+; CHECK-BE-NEXT:    movs r0, #3
+; CHECK-BE-NEXT:    vstrh.16 q0, [sp, #16]
+; CHECK-BE-NEXT:    strh.w r0, [sp, #16]
+; CHECK-BE-NEXT:    vldrb.u8 q1, [sp, #16]
+; CHECK-BE-NEXT:    vrev64.8 q0, q1
+; CHECK-BE-NEXT:    add sp, #40
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = alloca [1 x [10 x [2 x i16]]], align 2
   %0 = bitcast [1 x [10 x [2 x i16]]]* %c to i8*
@@ -144,19 +247,34 @@
 }
 
 define arm_aapcs_vfpcc <16 x i8> @loadstore_16xi8_stack_off16() {
-; CHECK-LABEL: loadstore_16xi8_stack_off16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #40
-; CHECK-NEXT:    sub sp, #40
-; CHECK-NEXT:    vmov.i8 q0, #0x1
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vstrb.8 q0, [r0]
-; CHECK-NEXT:    movs r0, #3
-; CHECK-NEXT:    vstrb.8 q0, [sp, #16]
-; CHECK-NEXT:    strb.w r0, [sp, #16]
-; CHECK-NEXT:    vldrb.u8 q0, [sp, #16]
-; CHECK-NEXT:    add sp, #40
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: loadstore_16xi8_stack_off16:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #40
+; CHECK-LE-NEXT:    sub sp, #40
+; CHECK-LE-NEXT:    vmov.i8 q0, #0x1
+; CHECK-LE-NEXT:    mov r0, sp
+; CHECK-LE-NEXT:    vstrb.8 q0, [r0]
+; CHECK-LE-NEXT:    movs r0, #3
+; CHECK-LE-NEXT:    vstrb.8 q0, [sp, #16]
+; CHECK-LE-NEXT:    strb.w r0, [sp, #16]
+; CHECK-LE-NEXT:    vldrb.u8 q0, [sp, #16]
+; CHECK-LE-NEXT:    add sp, #40
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: loadstore_16xi8_stack_off16:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #40
+; CHECK-BE-NEXT:    sub sp, #40
+; CHECK-BE-NEXT:    vmov.i8 q0, #0x1
+; CHECK-BE-NEXT:    mov r0, sp
+; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
+; CHECK-BE-NEXT:    movs r0, #3
+; CHECK-BE-NEXT:    vstrb.8 q0, [sp, #16]
+; CHECK-BE-NEXT:    strb.w r0, [sp, #16]
+; CHECK-BE-NEXT:    vldrb.u8 q1, [sp, #16]
+; CHECK-BE-NEXT:    vrev64.8 q0, q1
+; CHECK-BE-NEXT:    add sp, #40
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = alloca [1 x [20 x [2 x i8]]], align 1
   %0 = bitcast [1 x [20 x [2 x i8]]]* %c to i8*
Index: llvm/trunk/test/CodeGen/Thumb2/mve-pred-spill.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-pred-spill.ll
+++ llvm/trunk/test/CodeGen/Thumb2/mve-pred-spill.ll
@@ -1,81 +1,165 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
 
-declare arm_aapcs_vfpcc <4 x i32> @ext_i32()
-declare arm_aapcs_vfpcc <8 x i16> @ext_i16()
-declare arm_aapcs_vfpcc <16 x i8> @ext_i8()
+declare arm_aapcs_vfpcc <4 x i32> @ext_i32(<4 x i32> %c)
+declare arm_aapcs_vfpcc <8 x i16> @ext_i16(<8 x i16> %c)
+declare arm_aapcs_vfpcc <16 x i8> @ext_i8(<16 x i8> %c)
 
 define arm_aapcs_vfpcc <4 x i32> @shuffle1_v4i32(<4 x i32> %src, <4 x i32> %a) {
-; CHECK-LABEL: shuffle1_v4i32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
-; CHECK-NEXT:    vcmp.i32 eq, q0, zr
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    bl ext_i32
-; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vpsel q0, q4, q0
-; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-LE-LABEL: shuffle1_v4i32:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
+; CHECK-LE-NEXT:    .vsave {d8, d9}
+; CHECK-LE-NEXT:    vpush {d8, d9}
+; CHECK-LE-NEXT:    .pad #8
+; CHECK-LE-NEXT:    sub sp, #8
+; CHECK-LE-NEXT:    vcmp.i32 eq, q0, zr
+; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
+; CHECK-LE-NEXT:    vpsel q0, q1, q0
+; CHECK-LE-NEXT:    vmov q4, q1
+; CHECK-LE-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-LE-NEXT:    bl ext_i32
+; CHECK-LE-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-LE-NEXT:    vpsel q0, q4, q0
+; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpop {d8, d9}
+; CHECK-LE-NEXT:    pop {r7, pc}
+;
+; CHECK-BE-LABEL: shuffle1_v4i32:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r7, lr}
+; CHECK-BE-NEXT:    push {r7, lr}
+; CHECK-BE-NEXT:    .vsave {d8, d9}
+; CHECK-BE-NEXT:    vpush {d8, d9}
+; CHECK-BE-NEXT:    .pad #8
+; CHECK-BE-NEXT:    sub sp, #8
+; CHECK-BE-NEXT:    vrev64.32 q4, q1
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vcmp.i32 eq, q1, zr
+; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
+; CHECK-BE-NEXT:    vpsel q1, q4, q0
+; CHECK-BE-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-BE-NEXT:    vrev64.32 q0, q1
+; CHECK-BE-NEXT:    bl ext_i32
+; CHECK-BE-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vpsel q1, q4, q1
+; CHECK-BE-NEXT:    vrev64.32 q0, q1
+; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vpop {d8, d9}
+; CHECK-BE-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <4 x i32> %src, zeroinitializer
-  %ext = call arm_aapcs_vfpcc <4 x i32> @ext_i32()
+  %s1 = select <4 x i1> %c, <4 x i32> %a, <4 x i32> zeroinitializer
+  %ext = call arm_aapcs_vfpcc <4 x i32> @ext_i32(<4 x i32> %s1)
   %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %ext
   ret <4 x i32> %s
 }
 
 define arm_aapcs_vfpcc <8 x i16> @shuffle1_v8i16(<8 x i16> %src, <8 x i16> %a) {
-; CHECK-LABEL: shuffle1_v8i16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
-; CHECK-NEXT:    vcmp.i16 eq, q0, zr
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    bl ext_i16
-; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vpsel q0, q4, q0
-; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-LE-LABEL: shuffle1_v8i16:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
+; CHECK-LE-NEXT:    .vsave {d8, d9}
+; CHECK-LE-NEXT:    vpush {d8, d9}
+; CHECK-LE-NEXT:    .pad #8
+; CHECK-LE-NEXT:    sub sp, #8
+; CHECK-LE-NEXT:    vcmp.i16 eq, q0, zr
+; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
+; CHECK-LE-NEXT:    vpsel q0, q1, q0
+; CHECK-LE-NEXT:    vmov q4, q1
+; CHECK-LE-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-LE-NEXT:    bl ext_i16
+; CHECK-LE-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-LE-NEXT:    vpsel q0, q4, q0
+; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpop {d8, d9}
+; CHECK-LE-NEXT:    pop {r7, pc}
+;
+; CHECK-BE-LABEL: shuffle1_v8i16:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r7, lr}
+; CHECK-BE-NEXT:    push {r7, lr}
+; CHECK-BE-NEXT:    .vsave {d8, d9}
+; CHECK-BE-NEXT:    vpush {d8, d9}
+; CHECK-BE-NEXT:    .pad #8
+; CHECK-BE-NEXT:    sub sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q4, q1
+; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vrev32.16 q1, q1
+; CHECK-BE-NEXT:    vcmp.i16 eq, q2, zr
+; CHECK-BE-NEXT:    vpsel q1, q4, q1
+; CHECK-BE-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-BE-NEXT:    vrev64.16 q0, q1
+; CHECK-BE-NEXT:    bl ext_i16
+; CHECK-BE-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-BE-NEXT:    vrev64.16 q1, q0
+; CHECK-BE-NEXT:    vpsel q1, q4, q1
+; CHECK-BE-NEXT:    vrev64.16 q0, q1
+; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vpop {d8, d9}
+; CHECK-BE-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <8 x i16> %src, zeroinitializer
-  %ext = call arm_aapcs_vfpcc <8 x i16> @ext_i16()
+  %s1 = select <8 x i1> %c, <8 x i16> %a, <8 x i16> zeroinitializer
+  %ext = call arm_aapcs_vfpcc <8 x i16> @ext_i16(<8 x i16> %s1)
   %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %ext
   ret <8 x i16> %s
 }
 
 define arm_aapcs_vfpcc <16 x i8> @shuffle1_v16i8(<16 x i8> %src, <16 x i8> %a) {
-; CHECK-LABEL: shuffle1_v16i8:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
-; CHECK-NEXT:    vcmp.i8 eq, q0, zr
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    bl ext_i8
-; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vpsel q0, q4, q0
-; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-LE-LABEL: shuffle1_v16i8:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
+; CHECK-LE-NEXT:    .vsave {d8, d9}
+; CHECK-LE-NEXT:    vpush {d8, d9}
+; CHECK-LE-NEXT:    .pad #8
+; CHECK-LE-NEXT:    sub sp, #8
+; CHECK-LE-NEXT:    vcmp.i8 eq, q0, zr
+; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
+; CHECK-LE-NEXT:    vpsel q0, q1, q0
+; CHECK-LE-NEXT:    vmov q4, q1
+; CHECK-LE-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-LE-NEXT:    bl ext_i8
+; CHECK-LE-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-LE-NEXT:    vpsel q0, q4, q0
+; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpop {d8, d9}
+; CHECK-LE-NEXT:    pop {r7, pc}
+;
+; CHECK-BE-LABEL: shuffle1_v16i8:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r7, lr}
+; CHECK-BE-NEXT:    push {r7, lr}
+; CHECK-BE-NEXT:    .vsave {d8, d9}
+; CHECK-BE-NEXT:    vpush {d8, d9}
+; CHECK-BE-NEXT:    .pad #8
+; CHECK-BE-NEXT:    sub sp, #8
+; CHECK-BE-NEXT:    vrev64.8 q4, q1
+; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-BE-NEXT:    vrev64.8 q2, q0
+; CHECK-BE-NEXT:    vrev32.8 q1, q1
+; CHECK-BE-NEXT:    vcmp.i8 eq, q2, zr
+; CHECK-BE-NEXT:    vpsel q1, q4, q1
+; CHECK-BE-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-BE-NEXT:    vrev64.8 q0, q1
+; CHECK-BE-NEXT:    bl ext_i8
+; CHECK-BE-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-BE-NEXT:    vrev64.8 q1, q0
+; CHECK-BE-NEXT:    vpsel q1, q4, q1
+; CHECK-BE-NEXT:    vrev64.8 q0, q1
+; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vpop {d8, d9}
+; CHECK-BE-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <16 x i8> %src, zeroinitializer
-  %ext = call arm_aapcs_vfpcc <16 x i8> @ext_i8()
+  %s1 = select <16 x i1> %c, <16 x i8> %a, <16 x i8> zeroinitializer
+  %ext = call arm_aapcs_vfpcc <16 x i8> @ext_i8(<16 x i8> %s1)
   %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %ext
   ret <16 x i8> %s
 }
Index: llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll
+++ llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
 
 define void @foo_int8_int32(<4 x i8>* %dest, <4 x i32>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int8_int32:
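
Taken together, the test updates run each file for both endiannesses from a single set of sources. A standalone reproducer in the same style (file layout and function name are assumed; the CHECK lines are deliberately loose, pinning only the byte-wise load/store selection, since the two VREV64.8 operations of a v2i64 load/store round trip may be folded away):

; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve \
; RUN:   -verify-machineinstrs %s -o - | FileCheck %s
define arm_aapcs_vfpcc void @copy_v2i64_align1(<2 x i64>* %d, <2 x i64>* %s) {
; CHECK-LABEL: copy_v2i64_align1:
; CHECK: vldrb.u8
; CHECK: vstrb.8
entry:
  %l = load <2 x i64>, <2 x i64>* %s, align 1
  store <2 x i64> %l, <2 x i64>* %d, align 1
  ret void
}
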