Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1222,6 +1222,8 @@
 
   bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1,
                                               LLT Ty2) const override;
+
+  bool preferScalarizeSplat(unsigned Opc) const override;
 };
 
 namespace AArch64 {
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24495,3 +24495,10 @@
 
   return nullptr;
 }
+
+bool AArch64TargetLowering::preferScalarizeSplat(unsigned Opc) const {
+  if (Subtarget->forceStreamingCompatibleSVE() &&
+      (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND))
+    return false;
+  return true;
+}
Index: llvm/test/CodeGen/AArch64/aarch64-force-streaming-compatible-sve.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-force-streaming-compatible-sve.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve -force-streaming-compatible-sve | FileCheck %s --check-prefix=CHECK
+
+define void @jpeg_add_quant_table(i32 %0, <8 x i64> %1, ptr %2) {
+; CHECK-LABEL: jpeg_add_quant_table:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z4.s, w0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    sunpklo z4.d, z4.s
+; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z4.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z4.d
+; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z4.d
+; CHECK-NEXT:    cmpgt p2.d, p0/z, z2.d, #0
+; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z4.d
+; CHECK-NEXT:    cmpgt p1.d, p0/z, z3.d, #0
+; CHECK-NEXT:    mov z3.d, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    cmpgt p2.d, p0/z, z1.d, #0
+; CHECK-NEXT:    cmpgt p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z0.d, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
+; CHECK-NEXT:    ptrue p1.s, vl2
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    splice z3.s, p1, z3.s, z2.s
+; CHECK-NEXT:    splice z1.s, p1, z1.s, z0.s
+; CHECK-NEXT:    uzp1 z2.h, z3.h, z3.h
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z1.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    and z0.h, z0.h, #0x1
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
+  %4 = sext i32 %0 to i64
+  %5 = insertelement <8 x i64> zeroinitializer, i64 %4, i64 0
+  %6 = shufflevector <8 x i64> %5, <8 x i64> zeroinitializer, <8 x i32> zeroinitializer
+  %7 = mul <8 x i64> %6, %1
+  %.not = icmp sgt <8 x i64> %7, zeroinitializer
+  %8 = zext <8 x i1> %.not to <8 x i16>
+  store <8 x i16> %8, ptr %2, align 2
+  ret void
+}
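
Context note (not part of the patch): preferScalarizeSplat is the TargetLowering
hook the DAG combiner consults before folding a vector extend of a splat into a
splat of the scalar extend; the default implementation returns true. Returning
false here keeps the extend on the vector side when streaming-compatible SVE
codegen is forced, so it can be selected as a single unpack (sunpklo above)
instead of being scalarized. Below is a minimal IR sketch of the affected
shape, reduced from the test; the file contents and function name are
illustrative, not taken from the patch:

; splat-sext-mul.ll: a sign-extended scalar that is splatted and then
; multiplied, the pattern the new hook keeps in vector form under
; -force-streaming-compatible-sve.
define <4 x i64> @splat_sext_mul(i32 %x, <4 x i64> %v) {
  %e = sext i32 %x to i64
  %i = insertelement <4 x i64> poison, i64 %e, i64 0
  %s = shufflevector <4 x i64> %i, <4 x i64> poison, <4 x i32> zeroinitializer
  %m = mul <4 x i64> %s, %v
  ret <4 x i64> %m
}

Compiling this sketch with llc -mattr=+sve -force-streaming-compatible-sve
should, as in the autogenerated test above, splat the 32-bit value and widen it
with a vector extend rather than re-materializing it per lane.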