Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5220,10 +5220,11 @@ } bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { + bool OverrideNEON = Subtarget->useSVEForFixedLengthVectors(); return ExtVal.getValueType().isScalableVector() || - useSVEForFixedLengthVectorVT( - ExtVal.getValueType(), - /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()); + useSVEForFixedLengthVectorVT(ExtVal.getValueType(), OverrideNEON) || + (OverrideNEON && ExtVal.getValueType().getFixedSizeInBits() > + Subtarget->getMinSVEVectorSizeInBits()); } unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { Index: llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll @@ -73,13 +73,10 @@ define <64 x i32> @load_zext_v64i16i32(ptr %ap) #0 { ; VBITS_GE_1024-LABEL: load_zext_v64i16i32: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 -; VBITS_GE_1024-NEXT: mov x9, #32 -; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: uunpklo z1.s, z0.h -; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64 -; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_1024-NEXT: ld1h { z1.s }, p0/z, [x0] ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] ; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8] ; VBITS_GE_1024-NEXT: ret @@ -145,13 +142,10 @@ define <64 x i32> @load_sext_v64i16i32(ptr %ap) #0 { ; VBITS_GE_1024-LABEL: load_sext_v64i16i32: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 -; VBITS_GE_1024-NEXT: mov x9, #32 -; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: sunpklo z1.s, z0.h -; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64 -; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_1024-NEXT: ld1sh { z1.s }, p0/z, [x0] ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] ; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8] ; VBITS_GE_1024-NEXT: ret @@ -170,17 +164,11 @@ define <32 x i64> @load_zext_v32i8i64(ptr %ap) #0 { ; VBITS_GE_1024-LABEL: load_zext_v32i8i64: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.b, vl32 -; VBITS_GE_1024-NEXT: mov x9, #16 -; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_1024-NEXT: mov w9, #16 // =0x10 ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: uunpklo z1.h, z0.b -; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_1024-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s -; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [x0, x9] +; VBITS_GE_1024-NEXT: ld1b { z1.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] ; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] ; VBITS_GE_1024-NEXT: ret @@ -199,17 +187,11 @@ define <32 x i64> @load_sext_v32i8i64(ptr %ap) #0 { ; VBITS_GE_1024-LABEL: load_sext_v32i8i64: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.b, vl32 -; VBITS_GE_1024-NEXT: mov x9, #16 -; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_1024-NEXT: mov w9, #16 // =0x10 ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: sunpklo z1.h, z0.b -; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_1024-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_1024-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_1024-NEXT: ld1sb { z0.d }, p0/z, [x0, x9] +; VBITS_GE_1024-NEXT: ld1sb { z1.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] ; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] ; VBITS_GE_1024-NEXT: ret @@ -228,15 +210,10 @@ define <32 x i64> @load_zext_v32i16i64(ptr %ap) #0 { ; VBITS_GE_1024-LABEL: load_zext_v32i16i64: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 -; VBITS_GE_1024-NEXT: mov x9, #16 -; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: uunpklo z1.s, z0.h -; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #32 -; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s -; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_1024-NEXT: ld1h { z1.d }, p0/z, [x0] ; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] ; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] ; VBITS_GE_1024-NEXT: ret @@ -255,15 +232,10 @@ define <32 x i64> @load_sext_v32i16i64(ptr %ap) #0 { ; VBITS_GE_1024-LABEL: load_sext_v32i16i64: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 -; VBITS_GE_1024-NEXT: mov x9, #16 -; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: sunpklo z1.s, z0.h -; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #32 -; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_1024-NEXT: ld1sh { z0.d }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_1024-NEXT: ld1sh { z1.d }, p0/z, [x0] ; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] ; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] ; VBITS_GE_1024-NEXT: ret @@ -282,13 +254,10 @@ define <32 x i64> @load_zext_v32i32i64(ptr %ap) #0 { ; VBITS_GE_1024-LABEL: load_zext_v32i32i64: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: mov x9, #16 -; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: uunpklo z1.d, z0.s -; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64 -; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_1024-NEXT: ld1w { z1.d }, p0/z, [x0] ; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] ; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] ; VBITS_GE_1024-NEXT: ret @@ -307,13 +276,10 @@ define <32 x i64> @load_sext_v32i32i64(ptr %ap) #0 { ; VBITS_GE_1024-LABEL: load_sext_v32i32i64: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: mov x9, #16 -; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: sunpklo z1.d, z0.s -; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64 -; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_1024-NEXT: ld1sw { z1.d }, p0/z, [x0] ; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] ; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] ; VBITS_GE_1024-NEXT: ret Index: llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -86,42 +86,35 @@ define void @smulh_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smulh_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] -; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: sunpklo z4.h, z0.b -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: sunpklo z5.h, z1.b -; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 -; VBITS_GE_256-NEXT: sunpklo z6.h, z2.b -; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 -; VBITS_GE_256-NEXT: sunpklo z7.h, z3.b -; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 -; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b -; VBITS_GE_256-NEXT: sunpklo z2.h, z2.b -; VBITS_GE_256-NEXT: sunpklo z3.h, z3.b -; VBITS_GE_256-NEXT: mul z4.h, p1/m, z4.h, z6.h -; VBITS_GE_256-NEXT: mul z0.h, p1/m, z0.h, z2.h -; VBITS_GE_256-NEXT: movprfx z2, z5 -; VBITS_GE_256-NEXT: mul z2.h, p1/m, z2.h, z7.h -; VBITS_GE_256-NEXT: mul z1.h, p1/m, z1.h, z3.h -; VBITS_GE_256-NEXT: lsr z0.h, z0.h, #8 -; VBITS_GE_256-NEXT: lsr z3.h, z4.h, #8 -; VBITS_GE_256-NEXT: lsr z1.h, z1.h, #8 +; VBITS_GE_256-NEXT: mov w8, #16 // =0x10 +; VBITS_GE_256-NEXT: mov w9, #32 // =0x20 +; VBITS_GE_256-NEXT: mov w10, #48 // =0x30 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1sb { z1.h }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1sb { z2.h }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1sb { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1sb { z4.h }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1sb { z5.h }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1sb { z6.h }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1sb { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.h, p0/m, z2.h, z5.h +; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z4.h +; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z6.h +; VBITS_GE_256-NEXT: mul z3.h, p0/m, z3.h, z7.h ; VBITS_GE_256-NEXT: lsr z2.h, z2.h, #8 -; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b -; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b -; VBITS_GE_256-NEXT: ptrue p1.b, vl16 +; VBITS_GE_256-NEXT: lsr z1.h, z1.h, #8 +; VBITS_GE_256-NEXT: lsr z0.h, z0.h, #8 ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b -; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z0.b -; VBITS_GE_256-NEXT: splice z2.b, p1, z2.b, z1.b -; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: lsr z3.h, z3.h, #8 +; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z3.b +; VBITS_GE_256-NEXT: splice z2.b, p0, z2.b, z0.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] ; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; @@ -249,35 +242,36 @@ define void @smulh_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smulh_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x10, #24 // =0x18 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z1.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z2.s }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1sh { z4.s }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z5.s }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z6.s }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.s, p0/m, z2.s, z5.s +; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z4.s +; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z6.s +; VBITS_GE_256-NEXT: mul z3.s, p0/m, z3.s, z7.s +; VBITS_GE_256-NEXT: lsr z2.s, z2.s, #16 +; VBITS_GE_256-NEXT: lsr z1.s, z1.s, #16 +; VBITS_GE_256-NEXT: lsr z0.s, z0.s, #16 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: lsr z3.s, z3.s, #16 +; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: ptrue p1.h, vl8 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: mov z7.d, z1.d -; VBITS_GE_256-NEXT: mov z16.d, z3.d -; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16 -; VBITS_GE_256-NEXT: smull2 v4.4s, v0.8h, v2.8h -; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16 -; VBITS_GE_256-NEXT: smull v5.4s, v0.4h, v2.4h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 -; VBITS_GE_256-NEXT: smull2 v6.4s, v1.8h, v3.8h -; VBITS_GE_256-NEXT: smull v1.4s, v1.4h, v3.4h -; VBITS_GE_256-NEXT: smull2 v3.4s, v0.8h, v2.8h -; VBITS_GE_256-NEXT: smull v0.4s, v0.4h, v2.4h -; VBITS_GE_256-NEXT: smull2 v2.4s, v7.8h, v16.8h -; VBITS_GE_256-NEXT: smull v7.4s, v7.4h, v16.4h -; VBITS_GE_256-NEXT: uzp2 v4.8h, v5.8h, v4.8h -; VBITS_GE_256-NEXT: uzp2 v1.8h, v1.8h, v6.8h -; VBITS_GE_256-NEXT: uzp2 v0.8h, v0.8h, v3.8h -; VBITS_GE_256-NEXT: uzp2 v2.8h, v7.8h, v2.8h -; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z0.h -; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h -; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smulh_v32i16: @@ -396,35 +390,36 @@ define void @smulh_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smulh_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x10, #12 // =0xc +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z1.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z2.d }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1sw { z4.d }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z5.d }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z6.d }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.d, p0/m, z2.d, z5.d +; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z4.d +; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z6.d +; VBITS_GE_256-NEXT: mul z3.d, p0/m, z3.d, z7.d +; VBITS_GE_256-NEXT: lsr z2.d, z2.d, #32 +; VBITS_GE_256-NEXT: lsr z1.d, z1.d, #32 +; VBITS_GE_256-NEXT: lsr z0.d, z0.d, #32 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 +; VBITS_GE_256-NEXT: lsr z3.d, z3.d, #32 +; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p0, z2.s, z0.s ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: ptrue p1.s, vl4 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: mov z7.d, z1.d -; VBITS_GE_256-NEXT: mov z16.d, z3.d -; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16 -; VBITS_GE_256-NEXT: smull2 v4.2d, v0.4s, v2.4s -; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16 -; VBITS_GE_256-NEXT: smull v5.2d, v0.2s, v2.2s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 -; VBITS_GE_256-NEXT: smull2 v6.2d, v1.4s, v3.4s -; VBITS_GE_256-NEXT: smull v1.2d, v1.2s, v3.2s -; VBITS_GE_256-NEXT: smull2 v3.2d, v0.4s, v2.4s -; VBITS_GE_256-NEXT: smull v0.2d, v0.2s, v2.2s -; VBITS_GE_256-NEXT: smull2 v2.2d, v7.4s, v16.4s -; VBITS_GE_256-NEXT: smull v7.2d, v7.2s, v16.2s -; VBITS_GE_256-NEXT: uzp2 v4.4s, v5.4s, v4.4s -; VBITS_GE_256-NEXT: uzp2 v1.4s, v1.4s, v6.4s -; VBITS_GE_256-NEXT: uzp2 v0.4s, v0.4s, v3.4s -; VBITS_GE_256-NEXT: uzp2 v2.4s, v7.4s, v2.4s -; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z0.s -; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s -; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smulh_v16i32: @@ -547,7 +542,7 @@ define void @smulh_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smulh_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ptrue p1.d, vl2 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] @@ -735,42 +730,35 @@ define void @umulh_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umulh_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] -; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: uunpklo z4.h, z0.b -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: uunpklo z5.h, z1.b -; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 -; VBITS_GE_256-NEXT: uunpklo z6.h, z2.b -; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 -; VBITS_GE_256-NEXT: uunpklo z7.h, z3.b -; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 -; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b -; VBITS_GE_256-NEXT: uunpklo z2.h, z2.b -; VBITS_GE_256-NEXT: uunpklo z3.h, z3.b -; VBITS_GE_256-NEXT: mul z4.h, p1/m, z4.h, z6.h -; VBITS_GE_256-NEXT: mul z0.h, p1/m, z0.h, z2.h -; VBITS_GE_256-NEXT: movprfx z2, z5 -; VBITS_GE_256-NEXT: mul z2.h, p1/m, z2.h, z7.h -; VBITS_GE_256-NEXT: mul z1.h, p1/m, z1.h, z3.h -; VBITS_GE_256-NEXT: lsr z0.h, z0.h, #8 -; VBITS_GE_256-NEXT: lsr z3.h, z4.h, #8 -; VBITS_GE_256-NEXT: lsr z1.h, z1.h, #8 +; VBITS_GE_256-NEXT: mov w8, #16 // =0x10 +; VBITS_GE_256-NEXT: mov w9, #32 // =0x20 +; VBITS_GE_256-NEXT: mov w10, #48 // =0x30 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1b { z0.h }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.h }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.h }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z4.h }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z5.h }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z6.h }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.h, p0/m, z2.h, z5.h +; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z4.h +; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z6.h +; VBITS_GE_256-NEXT: mul z3.h, p0/m, z3.h, z7.h ; VBITS_GE_256-NEXT: lsr z2.h, z2.h, #8 -; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b -; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b -; VBITS_GE_256-NEXT: ptrue p1.b, vl16 +; VBITS_GE_256-NEXT: lsr z1.h, z1.h, #8 +; VBITS_GE_256-NEXT: lsr z0.h, z0.h, #8 ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b -; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z0.b -; VBITS_GE_256-NEXT: splice z2.b, p1, z2.b, z1.b -; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: lsr z3.h, z3.h, #8 +; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z3.b +; VBITS_GE_256-NEXT: splice z2.b, p0, z2.b, z0.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] ; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; @@ -898,35 +886,36 @@ define void @umulh_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umulh_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x10, #24 // =0x18 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.s }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.s }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.s }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.s, p0/m, z2.s, z5.s +; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z4.s +; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z6.s +; VBITS_GE_256-NEXT: mul z3.s, p0/m, z3.s, z7.s +; VBITS_GE_256-NEXT: lsr z2.s, z2.s, #16 +; VBITS_GE_256-NEXT: lsr z1.s, z1.s, #16 +; VBITS_GE_256-NEXT: lsr z0.s, z0.s, #16 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: lsr z3.s, z3.s, #16 +; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: ptrue p1.h, vl8 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: mov z7.d, z1.d -; VBITS_GE_256-NEXT: mov z16.d, z3.d -; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16 -; VBITS_GE_256-NEXT: umull2 v4.4s, v0.8h, v2.8h -; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16 -; VBITS_GE_256-NEXT: umull v5.4s, v0.4h, v2.4h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 -; VBITS_GE_256-NEXT: umull2 v6.4s, v1.8h, v3.8h -; VBITS_GE_256-NEXT: umull v1.4s, v1.4h, v3.4h -; VBITS_GE_256-NEXT: umull2 v3.4s, v0.8h, v2.8h -; VBITS_GE_256-NEXT: umull v0.4s, v0.4h, v2.4h -; VBITS_GE_256-NEXT: umull2 v2.4s, v7.8h, v16.8h -; VBITS_GE_256-NEXT: umull v7.4s, v7.4h, v16.4h -; VBITS_GE_256-NEXT: uzp2 v4.8h, v5.8h, v4.8h -; VBITS_GE_256-NEXT: uzp2 v1.8h, v1.8h, v6.8h -; VBITS_GE_256-NEXT: uzp2 v0.8h, v0.8h, v3.8h -; VBITS_GE_256-NEXT: uzp2 v2.8h, v7.8h, v2.8h -; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z0.h -; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h -; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umulh_v32i16: @@ -1047,35 +1036,36 @@ define void @umulh_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umulh_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x10, #12 // =0xc +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.d }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.d }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.d }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.d, p0/m, z2.d, z5.d +; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z4.d +; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z6.d +; VBITS_GE_256-NEXT: mul z3.d, p0/m, z3.d, z7.d +; VBITS_GE_256-NEXT: lsr z2.d, z2.d, #32 +; VBITS_GE_256-NEXT: lsr z1.d, z1.d, #32 +; VBITS_GE_256-NEXT: lsr z0.d, z0.d, #32 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 +; VBITS_GE_256-NEXT: lsr z3.d, z3.d, #32 +; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p0, z2.s, z0.s ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: ptrue p1.s, vl4 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: mov z7.d, z1.d -; VBITS_GE_256-NEXT: mov z16.d, z3.d -; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16 -; VBITS_GE_256-NEXT: umull2 v4.2d, v0.4s, v2.4s -; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16 -; VBITS_GE_256-NEXT: umull v5.2d, v0.2s, v2.2s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 -; VBITS_GE_256-NEXT: umull2 v6.2d, v1.4s, v3.4s -; VBITS_GE_256-NEXT: umull v1.2d, v1.2s, v3.2s -; VBITS_GE_256-NEXT: umull2 v3.2d, v0.4s, v2.4s -; VBITS_GE_256-NEXT: umull v0.2d, v0.2s, v2.2s -; VBITS_GE_256-NEXT: umull2 v2.2d, v7.4s, v16.4s -; VBITS_GE_256-NEXT: umull v7.2d, v7.2s, v16.2s -; VBITS_GE_256-NEXT: uzp2 v4.4s, v5.4s, v4.4s -; VBITS_GE_256-NEXT: uzp2 v1.4s, v1.4s, v6.4s -; VBITS_GE_256-NEXT: uzp2 v0.4s, v0.4s, v3.4s -; VBITS_GE_256-NEXT: uzp2 v2.4s, v7.4s, v2.4s -; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z0.s -; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s -; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umulh_v16i32: @@ -1196,7 +1186,7 @@ define void @umulh_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umulh_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ptrue p1.d, vl2 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] Index: llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -63,16 +63,14 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap) #0 { ; CHECK-LABEL: load_sext_v16i8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z4.h, z1.b -; CHECK-NEXT: sunpklo z0.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: sunpklo z2.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: sunpklo z1.s, z3.h -; CHECK-NEXT: sunpklo z3.s, z4.h +; CHECK-NEXT: mov w8, #4 // =0x4 +; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: mov w10, #12 // =0xc +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1sb { z1.s }, p0/z, [x0, x8] +; CHECK-NEXT: ld1sb { z2.s }, p0/z, [x0, x9] +; CHECK-NEXT: ld1sb { z3.s }, p0/z, [x0, x10] +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 @@ -86,10 +84,10 @@ define <8 x i32> @load_sext_v8i16i32(ptr %ap) #0 { ; CHECK-LABEL: load_sext_v8i16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: sunpklo z0.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret @@ -165,25 +163,22 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap) #0 { ; CHECK-LABEL: load_zext_v16i16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z7.s, z1.h -; CHECK-NEXT: uunpklo z0.d, z3.s -; CHECK-NEXT: uunpklo z5.s, z2.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: uunpklo z16.s, z2.h -; CHECK-NEXT: uunpklo z4.d, z5.s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: uunpklo z2.d, z7.s -; CHECK-NEXT: uunpklo z6.d, z16.s -; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 -; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 -; CHECK-NEXT: uunpklo z1.d, z3.s -; CHECK-NEXT: uunpklo z5.d, z5.s -; CHECK-NEXT: uunpklo z3.d, z7.s -; CHECK-NEXT: uunpklo z7.d, z16.s +; CHECK-NEXT: mov x8, #2 // =0x2 +; CHECK-NEXT: mov x9, #4 // =0x4 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov x10, #6 // =0x6 +; CHECK-NEXT: mov x11, #8 // =0x8 +; CHECK-NEXT: mov x12, #10 // =0xa +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: mov x8, #12 // =0xc +; CHECK-NEXT: ld1h { z2.d }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: mov x9, #14 // =0xe +; CHECK-NEXT: ld1h { z3.d }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z4.d }, p0/z, [x0, x11, lsl #1] +; CHECK-NEXT: ld1h { z5.d }, p0/z, [x0, x12, lsl #1] +; CHECK-NEXT: ld1h { z6.d }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 Index: llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -71,39 +71,34 @@ define void @smulh_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smulh_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: mov w8, #8 // =0x8 +; CHECK-NEXT: mov w9, #16 // =0x10 +; CHECK-NEXT: mov w10, #24 // =0x18 ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: sunpklo z4.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: sunpklo z5.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z6.h, z3.b -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: sunpklo z3.h, z3.b -; CHECK-NEXT: sunpklo z7.h, z2.b -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: sunpklo z2.h, z2.b -; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h -; CHECK-NEXT: movprfx z2, z5 -; CHECK-NEXT: mul z2.h, p0/m, z2.h, z7.h -; CHECK-NEXT: movprfx z3, z4 -; CHECK-NEXT: mul z3.h, p0/m, z3.h, z6.h +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, x9] +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0, x10] +; CHECK-NEXT: ld1sb { z3.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z4.h }, p0/z, [x1, x9] +; CHECK-NEXT: ld1sb { z5.h }, p0/z, [x1, x10] +; CHECK-NEXT: ld1sb { z6.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1sb { z7.h }, p0/z, [x1] +; CHECK-NEXT: mul z2.h, p0/m, z2.h, z5.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z4.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z6.h +; CHECK-NEXT: mul z3.h, p0/m, z3.h, z7.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 ; CHECK-NEXT: lsr z1.h, z1.h, #8 -; CHECK-NEXT: lsr z3.h, z3.h, #8 ; CHECK-NEXT: lsr z2.h, z2.h, #8 -; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z3.h, z3.h, #8 +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b ; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b -; CHECK-NEXT: stp q3, q2, [x0] +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: splice z3.b, p0, z3.b, z0.b +; CHECK-NEXT: splice z1.b, p0, z1.b, z2.b +; CHECK-NEXT: stp q3, q1, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -173,25 +168,34 @@ define void @smulh_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smulh_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: mov x9, #8 // =0x8 +; CHECK-NEXT: mov x10, #12 // =0xc +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1sh { z2.s }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1sh { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z4.s }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1sh { z5.s }, p0/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1sh { z6.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1sh { z7.s }, p0/z, [x1] +; CHECK-NEXT: mul z2.s, p0/m, z2.s, z5.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z4.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: mul z3.s, p0/m, z3.s, z7.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: lsr z2.s, z2.s, #16 +; CHECK-NEXT: lsr z3.s, z3.s, #16 +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z2.h -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: mov z2.d, z3.d -; CHECK-NEXT: smulh z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z3.b, #8 -; CHECK-NEXT: movprfx z3, z5 -; CHECK-NEXT: smulh z3.h, p0/m, z3.h, z6.h -; CHECK-NEXT: smulh z2.h, p0/m, z2.h, z4.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z3.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q3, q1, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -241,25 +245,34 @@ define void @smulh_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smulh_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov x8, #2 // =0x2 +; CHECK-NEXT: mov x9, #4 // =0x4 +; CHECK-NEXT: mov x10, #6 // =0x6 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1sw { z2.d }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1sw { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z4.d }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1sw { z5.d }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1sw { z6.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1sw { z7.d }, p0/z, [x1] +; CHECK-NEXT: mul z2.d, p0/m, z2.d, z5.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z4.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z6.d +; CHECK-NEXT: mul z3.d, p0/m, z3.d, z7.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: lsr z2.d, z2.d, #32 +; CHECK-NEXT: lsr z3.d, z3.d, #32 +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z2.s -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: mov z2.d, z3.d -; CHECK-NEXT: smulh z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: ext z2.b, z2.b, z3.b, #8 -; CHECK-NEXT: movprfx z3, z5 -; CHECK-NEXT: smulh z3.s, p0/m, z3.s, z6.s -; CHECK-NEXT: smulh z2.s, p0/m, z2.s, z4.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: splice z3.s, p0, z3.s, z0.s ; CHECK-NEXT: splice z1.s, p0, z1.s, z2.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q3, q1, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -410,39 +423,34 @@ define void @umulh_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umulh_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: mov w8, #8 // =0x8 +; CHECK-NEXT: mov w9, #16 // =0x10 +; CHECK-NEXT: mov w10, #24 // =0x18 ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: uunpklo z4.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: uunpklo z5.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z6.h, z3.b -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uunpklo z3.h, z3.b -; CHECK-NEXT: uunpklo z7.h, z2.b -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h -; CHECK-NEXT: movprfx z2, z5 -; CHECK-NEXT: mul z2.h, p0/m, z2.h, z7.h -; CHECK-NEXT: movprfx z3, z4 -; CHECK-NEXT: mul z3.h, p0/m, z3.h, z6.h +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0, x9] +; CHECK-NEXT: ld1b { z2.h }, p0/z, [x0, x10] +; CHECK-NEXT: ld1b { z3.h }, p0/z, [x0] +; CHECK-NEXT: ld1b { z4.h }, p0/z, [x1, x9] +; CHECK-NEXT: ld1b { z5.h }, p0/z, [x1, x10] +; CHECK-NEXT: ld1b { z6.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z7.h }, p0/z, [x1] +; CHECK-NEXT: mul z2.h, p0/m, z2.h, z5.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z4.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z6.h +; CHECK-NEXT: mul z3.h, p0/m, z3.h, z7.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 ; CHECK-NEXT: lsr z1.h, z1.h, #8 -; CHECK-NEXT: lsr z3.h, z3.h, #8 ; CHECK-NEXT: lsr z2.h, z2.h, #8 -; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z3.h, z3.h, #8 +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b ; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b -; CHECK-NEXT: stp q3, q2, [x0] +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: splice z3.b, p0, z3.b, z0.b +; CHECK-NEXT: splice z1.b, p0, z1.b, z2.b +; CHECK-NEXT: stp q3, q1, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -512,25 +520,34 @@ define void @umulh_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umulh_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: mov x9, #8 // =0x8 +; CHECK-NEXT: mov x10, #12 // =0xc +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1h { z4.s }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z5.s }, p0/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z6.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z7.s }, p0/z, [x1] +; CHECK-NEXT: mul z2.s, p0/m, z2.s, z5.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z4.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: mul z3.s, p0/m, z3.s, z7.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: lsr z2.s, z2.s, #16 +; CHECK-NEXT: lsr z3.s, z3.s, #16 +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z2.h -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: mov z2.d, z3.d -; CHECK-NEXT: umulh z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z3.b, #8 -; CHECK-NEXT: movprfx z3, z5 -; CHECK-NEXT: umulh z3.h, p0/m, z3.h, z6.h -; CHECK-NEXT: umulh z2.h, p0/m, z2.h, z4.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z3.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q3, q1, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -580,25 +597,34 @@ define void @umulh_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umulh_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov x8, #2 // =0x2 +; CHECK-NEXT: mov x9, #4 // =0x4 +; CHECK-NEXT: mov x10, #6 // =0x6 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.d }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z5.d }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z6.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z7.d }, p0/z, [x1] +; CHECK-NEXT: mul z2.d, p0/m, z2.d, z5.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z4.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z6.d +; CHECK-NEXT: mul z3.d, p0/m, z3.d, z7.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: lsr z2.d, z2.d, #32 +; CHECK-NEXT: lsr z3.d, z3.d, #32 +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z2.s -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: mov z2.d, z3.d -; CHECK-NEXT: umulh z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: ext z2.b, z2.b, z3.b, #8 -; CHECK-NEXT: movprfx z3, z5 -; CHECK-NEXT: umulh z3.s, p0/m, z3.s, z6.s -; CHECK-NEXT: umulh z2.s, p0/m, z2.s, z4.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: splice z3.s, p0, z3.s, z0.s ; CHECK-NEXT: splice z1.s, p0, z1.s, z2.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q3, q1, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b