Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -400,13 +400,16 @@ [(int_aarch64_sve_sub node:$pred, node:$op1, node:$op2), (sub node:$op1, (vselect node:$pred, node:$op2, (SVEDup0)))]>; def AArch64mla_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), - [(int_aarch64_sve_mla node:$pred, node:$op1, node:$op2, node:$op3), - (add node:$op1, (AArch64mul_p_oneuse node:$pred, node:$op2, node:$op3)), - // add(a, select(mask, mul(b, c), splat(0))) -> mla(a, mask, b, c) - (add node:$op1, (vselect node:$pred, (AArch64mul_p_oneuse (SVEAllActive), node:$op2, node:$op3), (SVEDup0)))]>; + [(int_aarch64_sve_mla node:$pred, node:$op1, node:$op2, node:$op3)]>; +// Pattern for generating the pseudo for MLA_ZPmZZ/MAD_ZPmZZ. +def AArch64mla_p : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), + [(add node:$op1, (AArch64mul_p_oneuse node:$pred, node:$op2, node:$op3)), + // add(a, select(mask, mul(b, c), splat(0))) -> mla(a, mask, b, c) + (add node:$op1, (vselect node:$pred, (AArch64mul_p_oneuse (SVEAllActive), node:$op2, node:$op3), (SVEDup0)))]>; def AArch64mls_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), - [(int_aarch64_sve_mls node:$pred, node:$op1, node:$op2, node:$op3), - (sub node:$op1, (AArch64mul_p_oneuse node:$pred, node:$op2, node:$op3)), + [(int_aarch64_sve_mls node:$pred, node:$op1, node:$op2, node:$op3)]>; +def AArch64mls_p : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), + [(sub node:$op1, (AArch64mul_p_oneuse node:$pred, node:$op2, node:$op3)), // sub(a, select(mask, mul(b, c), splat(0))) -> mls(a, mask, b, c) (sub node:$op1, (vselect node:$pred, (AArch64mul_p_oneuse (SVEAllActive), node:$op2, node:$op3), (SVEDup0)))]>; def AArch64eor3 : PatFrags<(ops node:$op1, node:$op2, node:$op3), @@ -483,10 +486,13 @@ defm SQSUB_ZI : 
sve_int_arith_imm0<0b110, "sqsub", ssubsat>; defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat>; - defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>; - defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>; - defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", AArch64mla_m1>; - defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", AArch64mls_m1>; + defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad, "MLA_ZPmZZ", /*isReverseInstr*/ 1>; + defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb, "MLS_ZPmZZ", /*isReverseInstr*/ 1>; + defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", AArch64mla_m1, "MLA_ZPZZZ", "MAD_ZPmZZ">; + defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", AArch64mls_m1, "MLS_ZPZZZ", "MSB_ZPmZZ">; + + defm MLA_ZPZZZ : sve_int_3op_p_mladdsub ; + defm MLS_ZPZZZ : sve_int_3op_p_mladdsub ; // SVE predicated integer reductions. defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", AArch64saddv_p>; Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -3123,11 +3123,16 @@ let hasSideEffects = 0; } -multiclass sve_int_mladdsub_vvv_pred opc, string asm, SDPatternOperator op> { - def _B : sve_int_mladdsub_vvv_pred<0b00, opc, asm, ZPR8>; - def _H : sve_int_mladdsub_vvv_pred<0b01, opc, asm, ZPR16>; - def _S : sve_int_mladdsub_vvv_pred<0b10, opc, asm, ZPR32>; - def _D : sve_int_mladdsub_vvv_pred<0b11, opc, asm, ZPR64>; +multiclass sve_int_mladdsub_vvv_pred opc, string asm, SDPatternOperator op, + string revname, bit isReverseInstr=0> { + def _B : sve_int_mladdsub_vvv_pred<0b00, opc, asm, ZPR8>, + SVEInstr2Rev; + def _H : sve_int_mladdsub_vvv_pred<0b01, opc, asm, ZPR16>, + SVEInstr2Rev; + def _S : sve_int_mladdsub_vvv_pred<0b10, opc, asm, ZPR32>, + SVEInstr2Rev; + def _D : sve_int_mladdsub_vvv_pred<0b11, 
opc, asm, ZPR64>, + SVEInstr2Rev; def : SVE_4_Op_Pat(NAME # _B)>; def : SVE_4_Op_Pat(NAME # _H)>; @@ -3156,16 +3161,21 @@ let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = DestructiveOther; + let DestructiveInstType = DestructiveTernaryCommWithRev; let ElementSize = zprty.ElementSize; let hasSideEffects = 0; } -multiclass sve_int_mlas_vvv_pred opc, string asm, SDPatternOperator op> { - def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>; - def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>; - def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>; - def _D : sve_int_mlas_vvv_pred<0b11, opc, asm, ZPR64>; +multiclass sve_int_mlas_vvv_pred opc, string asm, SDPatternOperator op, + string Ps, string revname, bit isReverseInstr=0> { + def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>, + SVEPseudo2Instr, SVEInstr2Rev; + def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_int_mlas_vvv_pred<0b11, opc, asm, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; def : SVE_4_Op_Pat(NAME # _B)>; def : SVE_4_Op_Pat(NAME # _H)>; @@ -3173,6 +3183,19 @@ def : SVE_4_Op_Pat(NAME # _D)>; } +// Multiclass for generating the pseudos for SVE MLA/MAD/MLS/MSB. +multiclass sve_int_3op_p_mladdsub { + def _UNDEF_B : PredThreeOpPseudo; + def _UNDEF_H : PredThreeOpPseudo; + def _UNDEF_S : PredThreeOpPseudo; + def _UNDEF_D : PredThreeOpPseudo; + + def : SVE_4_Op_Pat(NAME # _UNDEF_B)>; + def : SVE_4_Op_Pat(NAME # _UNDEF_H)>; + def : SVE_4_Op_Pat(NAME # _UNDEF_S)>; + def : SVE_4_Op_Pat(NAME # _UNDEF_D)>; +} + //===----------------------------------------------------------------------===// // SVE2 Integer Multiply-Add - Unpredicated Group //===----------------------------------------------------------------------===// Index: llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll 
=================================================================== --- llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll +++ llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll @@ -72,19 +72,18 @@ ; CHECK-LABEL: scatter_f16_index_offset_var: ; CHECK: // %bb.0: ; CHECK-NEXT: index z1.d, #0, #1 -; CHECK-NEXT: mov z3.d, x1 -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, x1 ; CHECK-NEXT: incd z2.d -; CHECK-NEXT: mla z3.d, p1/m, z1.d, z3.d -; CHECK-NEXT: mla z4.d, p1/m, z2.d, z4.d +; CHECK-NEXT: mad z1.d, p1/m, z3.d, z3.d +; CHECK-NEXT: mad z2.d, p1/m, z3.d, z3.d ; CHECK-NEXT: punpklo p1.h, p0.b -; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpklo z3.d, z0.s ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: st1h { z1.d }, p1, [x0, z3.d, lsl #1] -; CHECK-NEXT: st1h { z0.d }, p0, [x0, z4.d, lsl #1] +; CHECK-NEXT: st1h { z3.d }, p1, [x0, z1.d, lsl #1] +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, lsl #1] ; CHECK-NEXT: ret %t0 = insertelement undef, i64 %offset, i32 0 %t1 = shufflevector %t0, undef, zeroinitializer Index: llvm/test/CodeGen/AArch64/sve-int-arith.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-int-arith.ll +++ llvm/test/CodeGen/AArch64/sve-int-arith.ll @@ -343,8 +343,7 @@ ; CHECK-LABEL: mad_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mla z2.b, p0/m, z0.b, z1.b -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mad z0.b, p0/m, z1.b, z2.b ; CHECK-NEXT: ret %prod = mul %a, %b %res = add %c, %prod @@ -355,8 +354,7 @@ ; CHECK-LABEL: mad_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mla z2.h, p0/m, z0.h, z1.h -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %prod = mul %a, %b %res = add %c, %prod @@ -367,8 +365,7 @@ ; CHECK-LABEL: mad_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: 
mla z2.s, p0/m, z0.s, z1.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret %prod = mul %a, %b %res = add %c, %prod @@ -379,8 +376,7 @@ ; CHECK-LABEL: mad_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mla z2.d, p0/m, z0.d, z1.d -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: ret %prod = mul %a, %b %res = add %c, %prod @@ -451,8 +447,7 @@ ; CHECK-LABEL: msb_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mls z2.b, p0/m, z0.b, z1.b -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: msb z0.b, p0/m, z1.b, z2.b ; CHECK-NEXT: ret %prod = mul %a, %b %res = sub %c, %prod @@ -463,8 +458,7 @@ ; CHECK-LABEL: msb_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mls z2.h, p0/m, z0.h, z1.h -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: msb z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %prod = mul %a, %b %res = sub %c, %prod @@ -475,8 +469,7 @@ ; CHECK-LABEL: msb_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mls z2.s, p0/m, z0.s, z1.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: msb z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret %prod = mul %a, %b %res = sub %c, %prod @@ -487,8 +480,7 @@ ; CHECK-LABEL: msb_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mls z2.d, p0/m, z0.d, z1.d -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: msb z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: ret %prod = mul %a, %b %res = sub %c, %prod @@ -546,8 +538,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff -; CHECK-NEXT: mla z2.d, p0/m, z0.d, z1.d -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: ret { %1 = mul %a, %b @@ -560,8 +551,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff00000001 -; CHECK-NEXT: mla z2.d, p0/m, z0.d, z1.d -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: ret { %1 = mul %a, %b @@ -575,8 +565,7 @@ ; CHECK: // 
%bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0x10000 -; CHECK-NEXT: mla z2.s, p0/m, z0.s, z1.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret { %1 = mul %a, %b @@ -589,8 +578,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0xffff0000 -; CHECK-NEXT: mla z2.s, p0/m, z0.s, z1.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret { %1 = mul %a, %b @@ -616,8 +604,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, #-255 // =0xffffffffffffff01 -; CHECK-NEXT: mla z2.h, p0/m, z0.h, z1.h -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret { %1 = mul %a, %b Index: llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir =================================================================== --- llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir +++ llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir @@ -20,3 +20,23 @@ RET_ReallyLR ... + +# CHECK: {{.*}} MSB_ZPmZZ_B {{.*}} +--- +name: expand_mls_to_msb +body: | + bb.0: + renamable $p0 = PTRUE_B 31 + renamable $z0 = MLS_ZPZZZ_UNDEF_B killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1 + RET_ReallyLR implicit $z0 +... + +# CHECK: {{.*}} MAD_ZPmZZ_B {{.*}} +--- +name: expand_mla_to_mad +body: | + bb.0: + renamable $p0 = PTRUE_B 31 + renamable $z0 = MLA_ZPZZZ_UNDEF_B killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1 + RET_ReallyLR implicit $z0 +... 
Index: llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -109,10 +109,10 @@ define void @srem_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: srem_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: ldp q3, q1, [x1] ; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: sunpklo z7.h, z0.b ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 @@ -120,9 +120,9 @@ ; CHECK-NEXT: sunpklo z18.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 ; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: mov z4.d, z2.d -; CHECK-NEXT: sunpklo z6.h, z2.b -; CHECK-NEXT: ext z4.b, z4.b, z2.b, #8 +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sunpklo z6.h, z1.b +; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 ; CHECK-NEXT: sunpklo z16.s, z6.h ; CHECK-NEXT: sunpklo z4.h, z4.b ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 @@ -139,9 +139,9 @@ ; CHECK-NEXT: splice z17.h, p1, z17.h, z4.h ; CHECK-NEXT: sunpklo z4.s, z7.h ; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: mov z7.d, z2.d ; CHECK-NEXT: ext z6.b, z6.b, z3.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z2.b, #8 ; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s ; CHECK-NEXT: sunpklo z6.h, z6.b ; CHECK-NEXT: sunpklo z7.h, z7.b @@ -161,7 +161,7 @@ ; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h ; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h ; CHECK-NEXT: sunpklo z4.h, z3.b -; CHECK-NEXT: sunpklo z6.h, z1.b +; CHECK-NEXT: sunpklo z6.h, z2.b ; CHECK-NEXT: sunpklo z16.s, z4.h ; CHECK-NEXT: sunpklo z18.s, z6.h ; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 @@ -181,9 +181,9 @@ ; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: splice z7.b, p0, z7.b, z4.b ; CHECK-NEXT: splice z5.b, p0, z5.b, z6.b -; 
CHECK-NEXT: mls z1.b, p1/m, z7.b, z3.b -; CHECK-NEXT: mls z0.b, p1/m, z5.b, z2.b -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: mls z2.b, p1/m, z7.b, z3.b +; CHECK-NEXT: mls z0.b, p1/m, z5.b, z1.b +; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -492,10 +492,10 @@ define void @urem_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: urem_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: ldp q3, q1, [x1] ; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: uunpklo z7.h, z0.b ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 @@ -503,9 +503,9 @@ ; CHECK-NEXT: uunpklo z18.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 ; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: mov z4.d, z2.d -; CHECK-NEXT: uunpklo z6.h, z2.b -; CHECK-NEXT: ext z4.b, z4.b, z2.b, #8 +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: uunpklo z6.h, z1.b +; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 ; CHECK-NEXT: uunpklo z16.s, z6.h ; CHECK-NEXT: uunpklo z4.h, z4.b ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 @@ -522,9 +522,9 @@ ; CHECK-NEXT: splice z17.h, p1, z17.h, z4.h ; CHECK-NEXT: uunpklo z4.s, z7.h ; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: mov z7.d, z2.d ; CHECK-NEXT: ext z6.b, z6.b, z3.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z2.b, #8 ; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s ; CHECK-NEXT: uunpklo z6.h, z6.b ; CHECK-NEXT: uunpklo z7.h, z7.b @@ -544,7 +544,7 @@ ; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h ; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h ; CHECK-NEXT: uunpklo z4.h, z3.b -; CHECK-NEXT: uunpklo z6.h, z1.b +; CHECK-NEXT: uunpklo z6.h, z2.b ; CHECK-NEXT: uunpklo z16.s, z4.h ; CHECK-NEXT: uunpklo z18.s, z6.h ; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 @@ -564,9 +564,9 @@ ; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: splice z7.b, p0, z7.b, z4.b ; CHECK-NEXT: splice z5.b, 
p0, z5.b, z6.b -; CHECK-NEXT: mls z1.b, p1/m, z7.b, z3.b -; CHECK-NEXT: mls z0.b, p1/m, z5.b, z2.b -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: mls z2.b, p1/m, z7.b, z3.b +; CHECK-NEXT: mls z0.b, p1/m, z5.b, z1.b +; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b