Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4382,6 +4382,10 @@ // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles. def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>; +def load_align4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 4; +}]>; + let Predicates = [HasMVEInt] in { foreach VT = [ v4i1, v8i1, v16i1 ] in { def : Pat<(i32 (predicate_cast (VT VCCR:$src))), @@ -4394,6 +4398,13 @@ (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; } + // If we happen to be casting from a load we can convert that straight + // into a predicate load, so long as the load is of the correct type. + foreach VT = [ v4i1, v8i1, v16i1 ] in { + def : Pat<(VT (predicate_cast (i32 (load_align4 taddrmode_imm7<2>:$addr)))), + (VT (VLDR_P0_off taddrmode_imm7<2>:$addr))>; + } + // Here we match the specific SDNode type 'ARMVectorRegCastImpl' // rather than the more general 'ARMVectorRegCast' which would also // match some bitconverts. 
If we use the latter in cases where the Index: llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll @@ -386,18 +386,16 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4(i32* %i, <4 x i32> %a) { ; CHECK-LE-LABEL: load_bc4: ; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: ldr r0, [r0] +; CHECK-LE-NEXT: vldr p0, [r0] ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: vmsr p0, r0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_bc4: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: ldr r0, [r0] +; CHECK-BE-NEXT: vldr p0, [r0] ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: bx lr @@ -410,19 +408,17 @@ define arm_aapcs_vfpcc <8 x i16> @load_predcast8(i32* %i, <8 x i16> %a) { ; CHECK-LE-LABEL: load_predcast8: ; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: ldr r0, [r0] +; CHECK-LE-NEXT: vldr p0, [r0] ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: vmsr p0, r0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_predcast8: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: ldr r0, [r0] ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vldr p0, [r0] ; CHECK-BE-NEXT: vrev32.16 q0, q0 -; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.16 q0, q1 ; CHECK-BE-NEXT: bx lr @@ -435,19 +431,17 @@ define arm_aapcs_vfpcc <16 x i8> @load_predcast16(i32* %i, <16 x i8> %a) { ; CHECK-LE-LABEL: load_predcast16: ; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: ldr r0, [r0] +; CHECK-LE-NEXT: vldr p0, [r0] ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: vmsr p0, r0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_predcast16: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: ldr r0, [r0] 
; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vldr p0, [r0] ; CHECK-BE-NEXT: vrev32.8 q0, q0 -; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.8 q0, q1 ; CHECK-BE-NEXT: bx lr @@ -484,18 +478,18 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_offset(i16* %i, <4 x i32> %a) { ; CHECK-LE-LABEL: load_bc4_offset: ; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: ldr.w r0, [r0, #6] +; CHECK-LE-NEXT: adds r0, #6 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vldr p0, [r0] ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_bc4_offset: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: ldr.w r0, [r0, #6] +; CHECK-BE-NEXT: adds r0, #6 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vldr p0, [r0] ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: bx lr @@ -510,18 +504,16 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_range4(i32* %i, <4 x i32> %a) { ; CHECK-LE-LABEL: load_bc4_range4: ; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: ldr r0, [r0, #4] +; CHECK-LE-NEXT: vldr p0, [r0, #4] ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: vmsr p0, r0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_bc4_range4: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: ldr r0, [r0, #4] +; CHECK-BE-NEXT: vldr p0, [r0, #4] ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: bx lr @@ -535,18 +527,16 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_range(i32* %i, <4 x i32> %a) { ; CHECK-LE-LABEL: load_bc4_range: ; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: ldr.w r0, [r0, #508] +; CHECK-LE-NEXT: vldr p0, [r0, #508] ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: vmsr p0, r0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; 
CHECK-BE-LABEL: load_bc4_range: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: ldr.w r0, [r0, #508] +; CHECK-BE-NEXT: vldr p0, [r0, #508] ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: bx lr @@ -560,22 +550,16 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_range2(i32* %i, <4 x i32> %a) { ; CHECK-LE-LABEL: load_bc4_range2: ; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: movw r1, #65028 +; CHECK-LE-NEXT: vldr p0, [r0, #-508] ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: movt r1, #65535 -; CHECK-LE-NEXT: ldr r0, [r0, r1] -; CHECK-LE-NEXT: vmsr p0, r0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_bc4_range2: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: movw r1, #65028 +; CHECK-BE-NEXT: vldr p0, [r0, #-508] ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: movt r1, #65535 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: ldr r0, [r0, r1] -; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: bx lr @@ -589,18 +573,18 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_range3(i32* %i, <4 x i32> %a) { ; CHECK-LE-LABEL: load_bc4_range3: ; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: ldr.w r0, [r0, #512] +; CHECK-LE-NEXT: add.w r0, r0, #512 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vldr p0, [r0] ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_bc4_range3: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: ldr.w r0, [r0, #512] +; CHECK-BE-NEXT: add.w r0, r0, #512 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vldr p0, [r0] ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: bx lr @@ -614,22 +598,18 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_range5(i32* %i, <4 x i32> %a) { ; CHECK-LE-LABEL: load_bc4_range5: ; 
CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: movw r1, #65024 +; CHECK-LE-NEXT: sub.w r0, r0, #512 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: movt r1, #65535 -; CHECK-LE-NEXT: ldr r0, [r0, r1] -; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vldr p0, [r0] ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_bc4_range5: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: movw r1, #65024 +; CHECK-BE-NEXT: sub.w r0, r0, #512 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: movt r1, #65535 +; CHECK-BE-NEXT: vldr p0, [r0] ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: ldr r0, [r0, r1] -; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: bx lr