Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -5094,6 +5094,19 @@ (v16i8 (mul (v16i8 MQPR:$src2), (v16i8 (ARMvdup (i32 rGPR:$x))))))), (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>; + + def : Pat<(v4i32 (add (v4i32 (ARMvdup (i32 rGPR:$x))), + (v4i32 (mul (v4i32 MQPR:$src1), + (v4i32 MQPR:$src2))))), + (v4i32 (MVE_VMLAS_qr_u32 $src1, $src2, $x))>; + def : Pat<(v8i16 (add (v8i16 (ARMvdup (i32 rGPR:$x))), + (v8i16 (mul (v8i16 MQPR:$src1), + (v8i16 MQPR:$src2))))), + (v8i16 (MVE_VMLAS_qr_u16 $src1, $src2, $x))>; + def : Pat<(v16i8 (add (v16i8 (ARMvdup (i32 rGPR:$x))), + (v16i8 (mul (v16i8 MQPR:$src1), + (v16i8 MQPR:$src2))))), + (v16i8 (MVE_VMLAS_qr_u8 $src1, $src2, $x))>; } let Predicates = [HasMVEFloat] in { Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -404,9 +404,8 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 -; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3], #16 +; CHECK-NEXT: vmlas.u32 q1, q0, r2 +; CHECK-NEXT: vstrw.32 q1, [r3], #16 ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new @@ -609,9 +608,8 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.s32 q0, [r0], #8 ; CHECK-NEXT: vldrh.s32 q1, [r1], #8 -; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3], #16 +; CHECK-NEXT: vmlas.u32 q1, q0, r2 +; CHECK-NEXT: vstrw.32 q1, [r3], #16 ; CHECK-NEXT: letp lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} @@ -697,9 +695,8 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 -; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3], #16 +; CHECK-NEXT: vmlas.u32 q1, q0, r2 +; CHECK-NEXT: vstrw.32 q1, [r3], #16 ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new @@ -902,9 +899,8 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q0, [r0], #8 ; CHECK-NEXT: vldrh.u32 q1, [r1], #8 -; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3], #16 +; CHECK-NEXT: vmlas.u32 q1, q0, r2 +; CHECK-NEXT: vstrw.32 q1, [r3], #16 ; CHECK-NEXT: letp lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} @@ -990,9 +986,8 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 -; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3], #16 +; CHECK-NEXT: vmlas.u32 q1, q0, r2 +; CHECK-NEXT: vstrw.32 q1, [r3], #16 ; CHECK-NEXT: letp lr, .LBB9_5 ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new Index: llvm/test/CodeGen/Thumb2/mve-vmla.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmla.ll +++ llvm/test/CodeGen/Thumb2/mve-vmla.ll @@ -197,8 +197,7 @@ define arm_aapcs_vfpcc <4 x i32> @vmlasu32(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind { ; CHECK-LABEL: vmlasu32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmlas.u32 q0, q1, r0 ; CHECK-NEXT: bx lr entry: %0 = insertelement <4 x i32> undef, i32 %X, i32 0 @@ -211,8 +210,7 @@ define arm_aapcs_vfpcc <4 x i32> @vmlasu32b(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind { ; CHECK-LABEL: vmlasu32b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmlas.u32 q0, q1, r0 ; CHECK-NEXT: bx lr entry: %0 = insertelement <4 x i32> undef, i32 %X, i32 0 @@ -225,8 +223,7 @@ define arm_aapcs_vfpcc <8 x i16> @vmlasu16(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind { ; CHECK-LABEL: vmlasu16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q0, r0 +; CHECK-NEXT: vmlas.u16 q0, q1, r0 ; CHECK-NEXT: bx lr entry: %0 = insertelement <8 x i16> undef, i16 %X, i32 0 @@ -239,8 +236,7 @@ define arm_aapcs_vfpcc <8 x i16> @vmlasu16b(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind { ; CHECK-LABEL: vmlasu16b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q0, r0 +; CHECK-NEXT: vmlas.u16 q0, q1, r0 ; CHECK-NEXT: bx lr entry: %0 = insertelement <8 x i16> undef, i16 %X, i32 0 @@ -253,8 +249,7 @@ define arm_aapcs_vfpcc <16 x i8> @vmlasu8(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind { ; CHECK-LABEL: vmlasu8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.i8 q0, q0, q1 -; CHECK-NEXT: vadd.i8 q0, q0, r0 +; CHECK-NEXT: vmlas.u8 q0, q1, r0 ; CHECK-NEXT: bx lr entry: %0 = insertelement <16 x i8> undef, i8 %X, i32 0 @@ -267,8 +262,7 @@ define arm_aapcs_vfpcc <16 x i8> @vmlasu8b(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind { ; CHECK-LABEL: vmlasu8b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.i8 q0, q0, q1 -; CHECK-NEXT: vadd.i8 q0, q0, r0 +; CHECK-NEXT: vmlas.u8 q0, q1, r0 ; CHECK-NEXT: bx lr entry: %0 = insertelement <16 x i8> undef, i8 %X, i32 0 @@ -286,9 +280,8 @@ ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r1 -; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: vmlas.u32 q1, q0, r1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB15_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -325,9 +318,8 @@ ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vldrh.u16 q1, [r0], #16 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vmul.i16 q0, q1, q0 -; CHECK-NEXT: vadd.i16 q0, q0, r1 -; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: vmlas.u16 q1, q0, r1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB16_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -364,9 +356,8 @@ ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vldrh.u16 q1, [r0], #16 ; CHECK-NEXT: subs r3, #16 -; CHECK-NEXT: vmul.i8 q0, q1, q0 -; CHECK-NEXT: vadd.i8 q0, q0, r1 -; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: vmlas.u8 q1, q0, r1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB17_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr