Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -5391,7 +5391,9 @@
 
   switch (MI.getOpcode()) {
   case ARM::VSETLNi32:
+  case ARM::MVE_VMOV_to_lane_32:
     // dX = VSETLNi32 dY, rZ, imm
+    // qX = MVE_VMOV_to_lane_32 qY, rZ, imm
     const MachineOperand &MOBaseReg = MI.getOperand(1);
     const MachineOperand &MOInsertedReg = MI.getOperand(2);
     if (MOInsertedReg.isUndef())
@@ -5402,7 +5404,7 @@
 
     InsertedReg.Reg = MOInsertedReg.getReg();
     InsertedReg.SubReg = MOInsertedReg.getSubReg();
-    InsertedReg.SubIdx = MOIndex.getImm() == 0 ? ARM::ssub_0 : ARM::ssub_1;
+    InsertedReg.SubIdx = ARM::ssub_0 + MOIndex.getImm();
     return true;
   }
   llvm_unreachable("Target dependent opcode missing");
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1825,12 +1825,13 @@
 }
 
 def MVE_VMOV_from_lane_32  : MVE_VMOV_lane_32<            MVE_VMOV_from_lane>;
-def MVE_VMOV_to_lane_32    : MVE_VMOV_lane_32<            MVE_VMOV_to_lane>;
 def MVE_VMOV_from_lane_s16 : MVE_VMOV_lane_16<"s16", 0b0, MVE_VMOV_from_lane>;
 def MVE_VMOV_from_lane_u16 : MVE_VMOV_lane_16<"u16", 0b1, MVE_VMOV_from_lane>;
-def MVE_VMOV_to_lane_16    : MVE_VMOV_lane_16< "16", 0b0, MVE_VMOV_to_lane>;
 def MVE_VMOV_from_lane_s8  : MVE_VMOV_lane_8 < "s8", 0b0, MVE_VMOV_from_lane>;
 def MVE_VMOV_from_lane_u8  : MVE_VMOV_lane_8 < "u8", 0b1, MVE_VMOV_from_lane>;
+let isInsertSubreg = 1 in
+def MVE_VMOV_to_lane_32    : MVE_VMOV_lane_32<            MVE_VMOV_to_lane>;
+def MVE_VMOV_to_lane_16    : MVE_VMOV_lane_16< "16", 0b0, MVE_VMOV_to_lane>;
 def MVE_VMOV_to_lane_8     : MVE_VMOV_lane_8 < "8",  0b0, MVE_VMOV_to_lane>;
 
 // This is the same as insertelt but allows the inserted value to be an i32 as
Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
+++ llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
@@ -274,11 +274,8 @@
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    bx lr
@@ -798,11 +795,8 @@
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    bx lr
@@ -1131,15 +1125,12 @@
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    asr.w r12, r2, #31
 ; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov r12, s4
-; CHECK-NEXT:    adds.w r12, r12, r3
-; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
@@ -1694,15 +1685,12 @@
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    asr.w r12, r2, #31
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov r12, s4
-; CHECK-NEXT:    adds.w r12, r12, r3
-; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -970,23 +970,20 @@
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    umull r12, r3, r1, r0
-; CHECK-NEXT:    mla r1, r1, r2, r3
+; CHECK-NEXT:    umull r12, r2, r1, r0
+; CHECK-NEXT:    mla r1, r1, r3, r2
 ; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.32 q2[0], r12
+; CHECK-NEXT:    mla lr, r2, r0, r1
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    umull r3, r1, r2, r0
+; CHECK-NEXT:    mla r1, r2, r4, r1
+; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    mla r1, r2, r0, r1
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov r12, s8
-; CHECK-NEXT:    umull lr, r0, r3, r2
-; CHECK-NEXT:    mla r0, r3, r4, r0
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    mla r2, r3, r2, r0
-; CHECK-NEXT:    adds.w r0, r12, lr
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    adds.w r0, r12, r3
+; CHECK-NEXT:    adc.w r1, r1, lr
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %m = mul <2 x i64> %x, %y
@@ -1843,23 +1840,20 @@
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r4, s5
 ; CHECK-NEXT:    vmov r6, s7
-; CHECK-NEXT:    umull r12, lr, r3, r2
-; CHECK-NEXT:    mla r3, r3, r4, lr
+; CHECK-NEXT:    umull lr, r12, r3, r2
+; CHECK-NEXT:    mla r3, r3, r4, r12
 ; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    vmov.32 q2[0], r12
-; CHECK-NEXT:    mla r2, r4, r2, r3
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov r12, s8
-; CHECK-NEXT:    umull lr, r5, r3, r4
-; CHECK-NEXT:    mla r3, r3, r6, r5
+; CHECK-NEXT:    mla r12, r4, r2, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    umull r2, r5, r4, r3
+; CHECK-NEXT:    mla r4, r4, r6, r5
 ; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds.w r6, r12, lr
-; CHECK-NEXT:    mla r3, r5, r4, r3
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    adds r0, r0, r6
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    adds.w r2, r2, lr
+; CHECK-NEXT:    mla r3, r5, r3, r4
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %m = mul <2 x i64> %x, %y
Index: llvm/test/CodeGen/Thumb2/mve-vld3.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -1132,20 +1132,18 @@
 ; CHECK-NEXT:    ldrd r2, r3, [r0]
 ; CHECK-NEXT:    ldr r0, [r0, #8]
 ; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.16 q2[0], r2
 ; CHECK-NEXT:    vmov.32 q0[1], r3
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vmovx.f16 s12, s0
-; CHECK-NEXT:    vmov.16 q2[0], r0
 ; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vins.f16 s12, s2
 ; CHECK-NEXT:    vins.f16 s8, s4
-; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vadd.f16 q1, q2, q3
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vins.f16 s0, s8
-; CHECK-NEXT:    vadd.f16 q0, q1, q0
+; CHECK-NEXT:    vmovx.f16 s0, s2
+; CHECK-NEXT:    vmov.16 q2[0], r3
+; CHECK-NEXT:    vins.f16 s8, s0
+; CHECK-NEXT:    vadd.f16 q0, q1, q2
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    str r0, [r1]
 ; CHECK-NEXT:    bx lr
@@ -1179,16 +1177,15 @@
 ; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmovx.f16 s13, s3
 ; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmovx.f16 s16, s8
 ; CHECK-NEXT:    vins.f16 s13, s9
 ; CHECK-NEXT:    vins.f16 s5, s16
-; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vadd.f16 q1, q1, q3
-; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.16 q3[0], r0
 ; CHECK-NEXT:    vins.f16 s12, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s9
-; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.16 q3[2], r2
 ; CHECK-NEXT:    vins.f16 s13, s0
 ; CHECK-NEXT:    vadd.f16 q0, q1, q3
 ; CHECK-NEXT:    vmov r2, s1
Index: llvm/test/CodeGen/Thumb2/mve-vst3.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -18,7 +18,6 @@
 ; CHECK-NEXT:    vmov.f32 s8, s7
 ; CHECK-NEXT:    vmov.f32 s10, s1
 ; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    vmov.f64 d4, d2
 ; CHECK-NEXT:    vmov.f32 s9, s6
 ; CHECK-NEXT:    vmov.f32 s10, s0
@@ -1359,15 +1358,14 @@
 ; CHECK-NEXT:    ldr r2, [r0, #8]
 ; CHECK-NEXT:    vmovx.f16 s12, s8
 ; CHECK-NEXT:    vins.f16 s8, s9
-; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov q1, q2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmovx.f16 s8, s9
 ; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vins.f16 s5, s12
-; CHECK-NEXT:    vmovx.f16 s0, s0
 ; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmovx.f16 s0, s0
 ; CHECK-NEXT:    vins.f16 s6, s0
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov r2, s5
@@ -1406,26 +1404,24 @@
 ; CHECK-NEXT:    vmov.f32 s7, s9
 ; CHECK-NEXT:    vmovx.f16 s12, s4
 ; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov q2, q1
 ; CHECK-NEXT:    vmovx.f16 s14, s6
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov.16 q2[2], r4
 ; CHECK-NEXT:    vins.f16 s9, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s0
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.16 q2[4], r2
+; CHECK-NEXT:    vmovx.f16 s0, s1
 ; CHECK-NEXT:    vins.f16 s10, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s5
 ; CHECK-NEXT:    vins.f16 s5, s7
-; CHECK-NEXT:    vmovx.f16 s0, s1
 ; CHECK-NEXT:    vmovx.f16 s4, s7
 ; CHECK-NEXT:    vmov.f32 s11, s5
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q1[0], r0
 ; CHECK-NEXT:    vins.f16 s4, s12
 ; CHECK-NEXT:    vstrw.32 q2, [r1]
-; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.16 q1[2], r2
 ; CHECK-NEXT:    vins.f16 s5, s0
 ; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    vmov r0, s5