Index: llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -348,13 +348,25 @@
 /// \brief Expand Pseudos to Instructions with destructive operands.
 ///
-/// This mechanism uses MOVPRFX instructions for zeroing the false lanes
-/// or for fixing relaxed register allocation conditions to comply with
+/// This mechanism uses MOVPRFX instructions for merging/zeroing the false
+/// lanes or for fixing relaxed register allocation conditions to comply with
 /// the instructions register constraints. The latter case may be cheaper
 /// than setting the register constraints in the register allocator,
 /// since that will insert regular MOV instructions rather than MOVPRFX.
 ///
-/// Example (after register allocation):
+/// Merging example (after register allocation):
+///
+///   FADD_ZPZZ_B Z0, Pg, Z0, Z1, Z2
+///
+/// * The pseudo FADD_ZPZZ_B maps to FADD_ZPmZ_B, where Z2 is the
+///   passthru register.
+/// * We cannot map directly to FADD_ZPmZ_B because we need to
+///   carry the explicit passthru register.
+/// * FIXME: Add the register constraints once they are determined.
+/// * For performance, it is preferred to use the zero/undef
+///   variants.
+///
+/// Zeroing example (after register allocation):
 ///
 ///   FSUB_ZPZZ_ZERO_B Z0, Pg, Z1, Z0
 ///
@@ -379,9 +391,8 @@
 ///   MOVPRFX_ZPzZ_B Z0, Pg/z, Z0
 ///   FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1
 ///
-/// Note that this can only be done for _ZERO or _UNDEF variants where
-/// we can guarantee the false lanes to be zeroed (by implementing this)
-/// or that they are undef (don't care / not used), otherwise the
+/// Note that this can only be done for variants where we can guarantee
+/// how the false lanes are specified (zeroed, undef, or merged), otherwise the
 /// swapping of operands is illegal because the operation is not
 /// (or cannot be emulated to be) fully commutative.
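(For reference: mirroring the zeroing example, the merging pseudo above is expected to expand roughly as shown below. This is a sketch inferred from the expansion code and the CHECK lines in the tests later in this patch; the concrete registers after allocation may differ.)

    FADD_ZPZZ_B    Z2, Pg, Z0, Z1, Z2    // pseudo; Z2 is the tied passthru/destination
  becomes
    MOVPRFX_ZPmZ_B Z2, Pg/m, Z0          // copy the active lanes of Z0 into Z2,
                                         // keeping the passthru value in the false lanes
    FADD_ZPmZ_B    Z2, Pg/m, Z2, Z1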
 bool AArch64ExpandPseudo::expand_DestructiveOp(
@@ -391,7 +402,6 @@
   unsigned Opcode = AArch64::getSVEPseudoMap(MI.getOpcode());
   uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask;
   uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask;
-  bool FalseZero = FalseLanes == AArch64::FalseLanesZero;

   unsigned DstReg = MI.getOperand(0).getReg();
   bool DstIsDead = MI.getOperand(0).isDead();
@@ -400,21 +410,21 @@
     assert(DstReg != MI.getOperand(3).getReg());

   bool UseRev = false;
-  unsigned PredIdx, DOPIdx, SrcIdx;
+  unsigned PredIdx, DOPIdx, SrcIdx, PassthruIdx;
   switch (DType) {
   case AArch64::DestructiveBinaryComm:
   case AArch64::DestructiveBinaryCommWithRev:
     if (DstReg == MI.getOperand(3).getReg()) {
       // FSUB Zd, Pg, Zs1, Zd  ==>  FSUBR Zd, Pg/m, Zd, Zs1
-      std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 3, 2);
+      std::tie(PredIdx, DOPIdx, SrcIdx, PassthruIdx) = std::make_tuple(1, 3, 2, 4);
       UseRev = true;
       break;
     }
     LLVM_FALLTHROUGH;
   case AArch64::DestructiveBinary:
   case AArch64::DestructiveBinaryImm:
-    std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3);
-    break;
+    std::tie(PredIdx, DOPIdx, SrcIdx, PassthruIdx) = std::make_tuple(1, 2, 3, 4);
+    break;
   default:
     llvm_unreachable("Unsupported Destructive Operand type");
   }
@@ -449,24 +459,28 @@
   // Get the right MOVPRFX
   uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode);
-  unsigned MovPrfx, MovPrfxZero;
+  unsigned MovPrfx, MovPrfxZero, MovPrfxMerge;
   switch (ElementSize) {
   case AArch64::ElementSizeNone:
   case AArch64::ElementSizeB:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B;
+    MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_B;
     break;
   case AArch64::ElementSizeH:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H;
+    MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_H;
     break;
   case AArch64::ElementSizeS:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S;
+    MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_S;
     break;
   case AArch64::ElementSizeD:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D;
+    MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_D;
     break;
   default:
     llvm_unreachable("Unsupported ElementSize");
@@ -476,7 +490,7 @@
   // Create the destructive operation (if required)
   //
   MachineInstrBuilder PRFX, DOP;
-  if (FalseZero) {
+  if (FalseLanes == AArch64::FalseLanesZero) {
     assert(ElementSize != AArch64::ElementSizeNone &&
            "This instruction is unpredicated");
@@ -488,6 +502,16 @@
     // After the movprfx, the destructive operand is same as Dst
     DOPIdx = 0;
+  } else if (FalseLanes == AArch64::FalseLanesMerge) {
+    // FIXME: We might not be able to omit the MOVPRFX for the
+    // zero merging case.
+    if (DstReg != MI.getOperand(DOPIdx).getReg())
+      PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxMerge))
+                 .addReg(MI.getOperand(PassthruIdx).getReg(), RegState::Define)
+                 .addReg(MI.getOperand(PassthruIdx).getReg())
+                 .addReg(MI.getOperand(PredIdx).getReg())
+                 .addReg(MI.getOperand(DOPIdx).getReg());
+    DOPIdx = 0;
   } else if (DstReg != MI.getOperand(DOPIdx).getReg()) {
     PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx))
                .addReg(DstReg, RegState::Define)
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -37,12 +37,13 @@
 def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>;
 def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>;

-class FalseLanesEnum<bits<2> val> {
-  bits<2> Value = val;
+class FalseLanesEnum<bits<3> val> {
+  bits<3> Value = val;
 }
 def FalseLanesNone  : FalseLanesEnum<0>;
 def FalseLanesZero  : FalseLanesEnum<1>;
 def FalseLanesUndef : FalseLanesEnum<2>;
+def FalseLanesMerge : FalseLanesEnum<3>;

 // AArch64 Instruction Format
 class AArch64Inst : Instruction {
@@ -64,7 +65,7 @@
   DestructiveInstTypeEnum DestructiveInstType = NotDestructive;
   ElementSizeEnum ElementSize = ElementSizeNone;

-  let TSFlags{8-7} = FalseLanes.Value;
+  let TSFlags{9-7} = FalseLanes.Value;
   let TSFlags{6-3} = DestructiveInstType.Value;
   let TSFlags{2-0} = ElementSize.Value;

Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -420,9 +420,10 @@
 };

 enum FalseLaneType {
-  FalseLanesMask = TSFLAG_FALSE_LANE_TYPE(0x3),
-  FalseLanesZero = TSFLAG_FALSE_LANE_TYPE(0x1),
+  FalseLanesMask  = TSFLAG_FALSE_LANE_TYPE(0x3),
+  FalseLanesZero  = TSFLAG_FALSE_LANE_TYPE(0x1),
   FalseLanesUndef = TSFLAG_FALSE_LANE_TYPE(0x2),
+  FalseLanesMerge = TSFLAG_FALSE_LANE_TYPE(0x3),
 };

 #undef TSFLAG_ELEMENT_SIZE_TYPE
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -377,6 +377,11 @@
 def SVEDup0Undef : ComplexPattern;

 let AddedComplexity = 1 in {
+class SVE_3_Op_Pat_Sel<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                       ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, vt2:$Passthru), vt3:$Op3)),
+      (inst $Op1, $Op2, $Op3, $Passthru)>;
+
 class SVE_3_Op_Pat_SelZero
 : Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))),
@@ -457,6 +462,14 @@
       Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> {
     let FalseLanes = flags;
   }
+
+  class PredTwoOpMergePseudo<string name, ZPRRegOp zprty,
+                             FalseLanesEnum flags = FalseLanesNone>
+  : SVEPseudo2Instr<name, 0>,
+    Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zpt), []> {
+    let FalseLanes = flags;
+    let Constraints = "$Zd = $Zpt";
+  }
 }

 //===----------------------------------------------------------------------===//
@@ -4866,6 +4879,16 @@
   def : SVE_3_Op_Pat_SelZero<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _ZERO_H)>;
   def : SVE_3_Op_Pat_SelZero<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _ZERO_S)>;
   def : SVE_3_Op_Pat_SelZero<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _ZERO_D)>;
+
+  def _B : PredTwoOpMergePseudo<NAME # _B, ZPR8,  FalseLanesMerge>;
+  def _H : PredTwoOpMergePseudo<NAME # _H, ZPR16, FalseLanesMerge>;
+  def _S : PredTwoOpMergePseudo<NAME # _S, ZPR32, FalseLanesMerge>;
+  def _D : PredTwoOpMergePseudo<NAME # _D, ZPR64, FalseLanesMerge>;
+
+  def : SVE_3_Op_Pat_Sel<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _B)>;
+  def : SVE_3_Op_Pat_Sel<nxv8i16, op, nxv8i1,  nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _H)>;
+  def : SVE_3_Op_Pat_Sel<nxv4i32, op, nxv4i1,  nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _S)>;
+  def : SVE_3_Op_Pat_Sel<nxv2i64, op, nxv2i1,  nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _D)>;
 }

 multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm,
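(A note on what the new pattern matches: SVE_3_Op_Pat_Sel only fires when the vselect and the intrinsic share the same governing predicate and the select feeds the first data operand. A sketch of the IR shape it expects, using the add intrinsic as in the tests below; the other binary intrinsics covered by the multiclass follow the same shape:

  %a_m = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %passthru
  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1> %pg,
                                 <vscale x 16 x i8> %a_m, <vscale x 16 x i8> %b)

If the select uses a different predicate, or feeds the second operand of a non-commutative operation, the pattern does not apply and the default lowering for the intrinsic is used instead.)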
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
@@ -52,6 +52,114 @@
   ret %out
 }

+define <vscale x 16 x i8> @add_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a,
+                                  <vscale x 16 x i8> %b, <vscale x 16 x i8> %passthru) {
+; CHECK-LABEL: add_i8:
+; CHECK: movprfx z2.b, p0/m, z0.b
+; CHECK-NEXT: add z2.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %passthru
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1> %pg,
+                                 <vscale x 16 x i8> %a_m,
+                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @add_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a,
+                                   <vscale x 8 x i16> %b, <vscale x 8 x i16> %passthru) {
+; CHECK-LABEL: add_i16:
+; CHECK: movprfx z2.h, p0/m, z0.h
+; CHECK-NEXT: add z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %passthru
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1> %pg,
+                                 <vscale x 8 x i16> %a_m,
+                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @add_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a,
+                                   <vscale x 4 x i32> %b, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: add_i32:
+; CHECK: movprfx z2.s, p0/m, z0.s
+; CHECK-NEXT: add z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %passthru
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1> %pg,
+                                 <vscale x 4 x i32> %a_m,
+                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @add_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a,
+                                   <vscale x 2 x i64> %b, <vscale x 2 x i64> %passthru) {
+; CHECK-LABEL: add_i64:
+; CHECK: movprfx z2.d, p0/m, z0.d
+; CHECK-NEXT: add z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %passthru
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.add.nxv2i64(<vscale x 2 x i1> %pg,
+                                 <vscale x 2 x i64> %a_m,
+                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 16 x i8> @add_i8_comm(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a,
+                                       <vscale x 16 x i8> %b) {
+; CHECK-LABEL: add_i8_comm:
+; CHECK: add z1.b, p0/m, z1.b, z0.b
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1> %pg,
+                                 <vscale x 16 x i8> %a_m,
+                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @add_i16_comm(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a,
+                                        <vscale x 8 x i16> %b) {
+; CHECK-LABEL: add_i16_comm:
+; CHECK: add z1.h, p0/m, z1.h, z0.h
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1> %pg,
+                                 <vscale x 8 x i16> %a_m,
+                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @add_i32_comm(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a,
+                                        <vscale x 4 x i32> %b) {
+; CHECK-LABEL: add_i32_comm:
+; CHECK: add z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1> %pg,
+                                 <vscale x 4 x i32> %a_m,
+                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @add_i64_comm(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a,
+                                        <vscale x 2 x i64> %b) {
+; CHECK-LABEL: add_i64_comm:
+; CHECK: add z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.add.nxv2i64(<vscale x 2 x i1> %pg,
+                                 <vscale x 2 x i64> %a_m,
+                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; SUB
 ;
@@ -104,6 +212,62 @@
   ret %out
 }

+define <vscale x 16 x i8> @sub_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a,
+                                  <vscale x 16 x i8> %b, <vscale x 16 x i8> %passthru) {
+; CHECK-LABEL: sub_i8:
+; CHECK: movprfx z2.b, p0/m, z0.b
+; CHECK-NEXT: sub z2.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %passthru
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sub.nxv16i8(<vscale x 16 x i1> %pg,
+                                 <vscale x 16 x i8> %a_m,
+                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sub_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a,
+                                   <vscale x 8 x i16> %b, <vscale x 8 x i16> %passthru) {
+; CHECK-LABEL: sub_i16:
+; CHECK: movprfx z2.h, p0/m, z0.h
+; CHECK-NEXT: sub z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %passthru
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sub.nxv8i16(<vscale x 8 x i1> %pg,
+                                 <vscale x 8 x i16> %a_m,
+                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sub_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a,
+                                   <vscale x 4 x i32> %b, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: sub_i32:
+; CHECK: movprfx z2.s, p0/m, z0.s
+; CHECK-NEXT: sub z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %passthru
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sub.nxv4i32(<vscale x 4 x i1> %pg,
+                                 <vscale x 4 x i32> %a_m,
+                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sub_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a,
+                                   <vscale x 2 x i64> %b, <vscale x 2 x i64> %passthru) {
+; CHECK-LABEL: sub_i64:
+; CHECK: movprfx z2.d, p0/m, z0.d
+; CHECK-NEXT: sub z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %passthru
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sub.nxv2i64(<vscale x 2 x i1> %pg,
+                                 <vscale x 2 x i64> %a_m,
+                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; SUBR
 ;
@@ -156,6 +320,114 @@
   ret %out
 }

+define <vscale x 16 x i8> @subr_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a,
+                                   <vscale x 16 x i8> %b, <vscale x 16 x i8> %passthru) {
+; CHECK-LABEL: subr_i8:
+; CHECK: movprfx z2.b, p0/m, z0.b
+; CHECK-NEXT: subr z2.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %passthru
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.subr.nxv16i8(<vscale x 16 x i1> %pg,
+                                 <vscale x 16 x i8> %a_m,
+                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @subr_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a,
+                                    <vscale x 8 x i16> %b, <vscale x 8 x i16> %passthru) {
+; CHECK-LABEL: subr_i16:
+; CHECK: movprfx z2.h, p0/m, z0.h
+; CHECK-NEXT: subr z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %passthru
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.subr.nxv8i16(<vscale x 8 x i1> %pg,
+                                 <vscale x 8 x i16> %a_m,
+                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @subr_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a,
+                                    <vscale x 4 x i32> %b, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: subr_i32:
+; CHECK: movprfx z2.s, p0/m, z0.s
+; CHECK-NEXT: subr z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %passthru
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.subr.nxv4i32(<vscale x 4 x i1> %pg,
+                                 <vscale x 4 x i32> %a_m,
+                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @subr_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a,
+                                    <vscale x 2 x i64> %b, <vscale x 2 x i64> %passthru) {
+; CHECK-LABEL: subr_i64:
+; CHECK: movprfx z2.d, p0/m, z0.d
+; CHECK-NEXT: subr z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %passthru
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.subr.nxv2i64(<vscale x 2 x i1> %pg,
+                                 <vscale x 2 x i64> %a_m,
+                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 16 x i8> @subr_i8_rev(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a,
+                                       <vscale x 16 x i8> %b) {
+; CHECK-LABEL: subr_i8_rev:
+; CHECK: subr z1.b, p0/m, z1.b, z0.b
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sub.nxv16i8(<vscale x 16 x i1> %pg,
+                                 <vscale x 16 x i8> %a_m,
+                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @subr_i16_rev(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a,
+                                        <vscale x 8 x i16> %b) {
+; CHECK-LABEL: subr_i16_rev:
+; CHECK: subr z1.h, p0/m, z1.h, z0.h
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sub.nxv8i16(<vscale x 8 x i1> %pg,
+                                 <vscale x 8 x i16> %a_m,
+                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @subr_i32_rev(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a,
+                                        <vscale x 4 x i32> %b) {
+; CHECK-LABEL: subr_i32_rev:
+; CHECK: subr z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sub.nxv4i32(<vscale x 4 x i1> %pg,
+                                 <vscale x 4 x i32> %a_m,
+                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @subr_i64_rev(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a,
+                                        <vscale x 2 x i64> %b) {
+; CHECK-LABEL: subr_i64_rev:
+; CHECK: subr z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sub.nxv2i64(<vscale x 2 x i1> %pg,
+                                 <vscale x 2 x i64> %a_m,
+                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll
@@ -85,6 +85,62 @@
   ret %out
 }

+define <vscale x 16 x i8> @asr_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a,
+                                  <vscale x 16 x i8> %b, <vscale x 16 x i8> %passthru) {
+; CHECK-LABEL: asr_i8:
+; CHECK: movprfx z2.b, p0/m, z0.b
+; CHECK-NEXT: asr z2.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %passthru
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.asr.nxv16i8(<vscale x 16 x i1> %pg,
+                                 <vscale x 16 x i8> %a_m,
+                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @asr_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a,
+                                   <vscale x 8 x i16> %b, <vscale x 8 x i16> %passthru) {
+; CHECK-LABEL: asr_i16:
+; CHECK: movprfx z2.h, p0/m, z0.h
+; CHECK-NEXT: asr z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %passthru
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.asr.nxv8i16(<vscale x 8 x i1> %pg,
+                                 <vscale x 8 x i16> %a_m,
+                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @asr_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a,
+                                   <vscale x 4 x i32> %b, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: asr_i32:
+; CHECK: movprfx z2.s, p0/m, z0.s
+; CHECK-NEXT: asr z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %passthru
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.asr.nxv4i32(<vscale x 4 x i1> %pg,
+                                 <vscale x 4 x i32> %a_m,
+                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @asr_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a,
+                                   <vscale x 2 x i64> %b, <vscale x 2 x i64> %passthru) {
+; CHECK-LABEL: asr_i64:
+; CHECK: movprfx z2.d, p0/m, z0.d
+; CHECK-NEXT: asr z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %passthru
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.asr.nxv2i64(<vscale x 2 x i1> %pg,
+                                 <vscale x 2 x i64> %a_m,
+                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; ASRD
 ;
@@ -222,6 +278,62 @@
   ret %out
 }

+define <vscale x 16 x i8> @lsl_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a,
+                                  <vscale x 16 x i8> %b, <vscale x 16 x i8> %passthru) {
+; CHECK-LABEL: lsl_i8:
+; CHECK: movprfx z2.b, p0/m, z0.b
+; CHECK-NEXT: lsl z2.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %passthru
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.lsl.nxv16i8(<vscale x 16 x i1> %pg,
+                                 <vscale x 16 x i8> %a_m,
+                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @lsl_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a,
+                                   <vscale x 8 x i16> %b, <vscale x 8 x i16> %passthru) {
+; CHECK-LABEL: lsl_i16:
+; CHECK: movprfx z2.h, p0/m, z0.h
+; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %passthru
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.nxv8i16(<vscale x 8 x i1> %pg,
+                                 <vscale x 8 x i16> %a_m,
+                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @lsl_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a,
+                                   <vscale x 4 x i32> %b, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: lsl_i32:
+; CHECK: movprfx z2.s, p0/m, z0.s
+; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %passthru
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.nxv4i32(<vscale x 4 x i1> %pg,
+                                 <vscale x 4 x i32> %a_m,
+                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @lsl_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a,
+                                   <vscale x 2 x i64> %b, <vscale x 2 x i64> %passthru) {
+; CHECK-LABEL: lsl_i64:
+; CHECK: movprfx z2.d, p0/m, z0.d
+; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %passthru
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.lsl.nxv2i64(<vscale x 2 x i1> %pg,
+                                 <vscale x 2 x i64> %a_m,
+                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; LSR
 ;
@@ -307,6 +419,62 @@
   ret %out
 }

+define <vscale x 16 x i8> @lsr_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a,
+                                  <vscale x 16 x i8> %b, <vscale x 16 x i8> %passthru) {
+; CHECK-LABEL: lsr_i8:
+; CHECK: movprfx z2.b, p0/m, z0.b
+; CHECK-NEXT: lsr z2.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %passthru
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.lsr.nxv16i8(<vscale x 16 x i1> %pg,
+                                 <vscale x 16 x i8> %a_m,
+                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @lsr_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a,
+                                   <vscale x 8 x i16> %b, <vscale x 8 x i16> %passthru) {
+; CHECK-LABEL: lsr_i16:
+; CHECK: movprfx z2.h, p0/m, z0.h
+; CHECK-NEXT: lsr z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %passthru
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.nxv8i16(<vscale x 8 x i1> %pg,
+                                 <vscale x 8 x i16> %a_m,
+                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @lsr_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a,
+                                   <vscale x 4 x i32> %b, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: lsr_i32:
+; CHECK: movprfx z2.s, p0/m, z0.s
+; CHECK-NEXT: lsr z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %passthru
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg,
+                                 <vscale x 4 x i32> %a_m,
+                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @lsr_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a,
+                                   <vscale x 2 x i64> %b, <vscale x 2 x i64> %passthru) {
+; CHECK-LABEL: lsr_i64:
+; CHECK: movprfx z2.d, p0/m, z0.d
+; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+  %a_m = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %passthru
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1> %pg,
+                                 <vscale x 2 x i64> %a_m,
+                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.asr.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.asr.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.asr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
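(Usage note: to exercise just the two updated test files, an llvm-lit invocation along these lines should work; the path assumes a build directory named build at the top of the tree, and the files are expected to keep their existing RUN lines:

  ./build/bin/llvm-lit -v \
      llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll \
      llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll)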