Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14649,6 +14649,10 @@
   if (llvm::all_of(Mask, [](int Idx) { return Idx == UndefMaskElem; })) {
     return false;
   }
+  // A 64bit st2 which does not start at element 0 will involve adding extra
+  // ext instructions, making the st2 unprofitable.
+  if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 && Mask[0] != 0)
+    return false;
 
   Type *PtrTy = UseScalable
Index: llvm/test/CodeGen/AArch64/vldn_shuffle.ll
===================================================================
--- llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -301,21 +301,18 @@
 ; CHECK-LABEL: transpose_s16_8x8_:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: mov x8, x0
 ; CHECK-NEXT: ldp q2, q3, [x0, #32]
 ; CHECK-NEXT: trn1 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ldp q5, q6, [x0, #80]
+; CHECK-NEXT: ldp q4, q5, [x0, #64]
 ; CHECK-NEXT: trn1 v2.8h, v2.8h, v3.8h
-; CHECK-NEXT: ldr q4, [x8, #64]!
-; CHECK-NEXT: ldr q1, [x0, #112]
+; CHECK-NEXT: ldp q6, q1, [x0, #96]
 ; CHECK-NEXT: trn1 v3.8h, v4.8h, v5.8h
-; CHECK-NEXT: trn1 v1.8h, v6.8h, v1.8h
 ; CHECK-NEXT: trn1 v3.4s, v0.4s, v3.4s
+; CHECK-NEXT: trn1 v1.8h, v6.8h, v1.8h
 ; CHECK-NEXT: trn1 v4.4s, v2.4s, v1.4s
-; CHECK-NEXT: ext v0.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: ext v1.16b, v4.16b, v4.16b, #8
+; CHECK-NEXT: zip2 v0.4s, v3.4s, v4.4s
 ; CHECK-NEXT: st2 { v3.2s, v4.2s }, [x0]
-; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x8]
+; CHECK-NEXT: str q0, [x0, #64]
 ; CHECK-NEXT: ret
 entry:
   %0 = load <8 x i16>, ptr %a, align 16
@@ -355,21 +352,18 @@
 ; CHECK-LABEL: transpose_s16_8x82:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ldp q0, q2, [x0]
-; CHECK-NEXT: mov x8, x0
 ; CHECK-NEXT: ldp q3, q4, [x0, #32]
 ; CHECK-NEXT: mov v0.h[5], v2.h[4]
-; CHECK-NEXT: ldp q6, q7, [x0, #80]
+; CHECK-NEXT: ldp q5, q6, [x0, #64]
 ; CHECK-NEXT: zip1 v3.8h, v3.8h, v4.8h
-; CHECK-NEXT: ldr q5, [x8, #64]!
-; CHECK-NEXT: ldr q2, [x0, #112]
+; CHECK-NEXT: ldp q7, q2, [x0, #96]
 ; CHECK-NEXT: zip1 v4.8h, v5.8h, v6.8h
-; CHECK-NEXT: mov v7.h[5], v2.h[4]
 ; CHECK-NEXT: mov v0.s[1], v4.s[0]
+; CHECK-NEXT: mov v7.h[5], v2.h[4]
 ; CHECK-NEXT: uzp1 v1.4s, v3.4s, v7.4s
-; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: zip2 v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x0]
-; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x8]
+; CHECK-NEXT: str q2, [x0, #64]
 ; CHECK-NEXT: ret
 entry:
   %0 = load <8 x i16>, ptr %a, align 16
@@ -424,11 +418,10 @@
 ; CHECK-NEXT: trn1 v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT: trn1 v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: zip1 v1.4s, v2.4s, v0.4s
-; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT: trn1 v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: zip2 v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT: str q1, [x0]
-; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x1]
+; CHECK-NEXT: str q0, [x1]
 ; CHECK-NEXT: ret
   %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
   %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32>
@@ -443,11 +436,10 @@
 ; CHECK-LABEL: store_factor2_high2:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: zip1 v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: trn1 v0.4s, v2.4s, v1.4s
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: st2 { v3.2s, v4.2s }, [x1]
+; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: trn1 v2.4s, v2.4s, v1.4s
+; CHECK-NEXT: str q2, [x0]
+; CHECK-NEXT: str q0, [x1]
 ; CHECK-NEXT: ret
   %interleaved.vec = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
   %interleaved.vec2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
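
Note (illustrative, not part of the patch): the new guard only rejects factor-2 interleaved
stores whose sub-vector is 64 bits wide and whose shuffle mask does not start at element 0,
i.e. the "high half" store of a wider interleave/transpose. A minimal IR sketch of such a
pattern, assuming a hypothetical function name and mask values not taken from the patch:

define void @store_factor2_high_sketch(<4 x i32> %a0, <4 x i32> %a1, ptr %p) {
  ; Interleaves the high halves of %a0 and %a1; Mask[0] is 2, not 0, so with this
  ; patch the interleaved-store lowering no longer forms a 64-bit st2 (which would
  ; need extra ext instructions) and the shuffle can be matched as zip2 + str instead.
  %hi = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %hi, ptr %p, align 4
  ret void
}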