Index: llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -437,8 +437,6 @@
     DOPRegIsUnique = true;
     break;
   }
-
-  assert (DOPRegIsUnique && "The destructive operand should be unique");
 #endif
 
   // Resolve the reverse opcode
@@ -451,28 +449,32 @@
 
   // Get the right MOVPRFX
   uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode);
-  unsigned MovPrfx, MovPrfxZero, MovPrfxMerge;
+  unsigned MovPrfx, MovPrfxZero, MovPrfxMerge, Sel;
   switch (ElementSize) {
   case AArch64::ElementSizeNone:
   case AArch64::ElementSizeB:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B;
     MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_B;
+    Sel = AArch64::SEL_ZPZZ_B;
     break;
   case AArch64::ElementSizeH:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H;
     MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_H;
+    Sel = AArch64::SEL_ZPZZ_H;
     break;
   case AArch64::ElementSizeS:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S;
     MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_S;
+    Sel = AArch64::SEL_ZPZZ_S;
     break;
   case AArch64::ElementSizeD:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D;
     MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_D;
+    Sel = AArch64::SEL_ZPZZ_D;
     break;
   default:
     llvm_unreachable("Unsupported ElementSize");
@@ -482,7 +484,14 @@
   //
   // Create the destructive operation (if required)
   //
   MachineInstrBuilder PRFX, DOP;
-  if (FalseLanes == AArch64::FalseLanesZero) {
+
+  if (FalseLanes == AArch64::FalseLanesZero && !DOPRegIsUnique) {
+    PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Sel))
+               .addReg(DstReg, RegState::Define)
+               .addReg(MI.getOperand(PredIdx).getReg())
+               .addReg(MI.getOperand(DOPIdx).getReg())
+               .addReg(MI.getOperand(PassIdx).getReg());
+  } else if (FalseLanes == AArch64::FalseLanesZero) {
     assert(ElementSize != AArch64::ElementSizeNone &&
            "This instruction is unpredicated");
Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -162,10 +162,11 @@
     return false;
   }
 
-  bool SelectDupZero(SDValue N) {
+  bool SelectDupZero(SDValue N, SDValue &Res) {
     switch(N->getOpcode()) {
     case AArch64ISD::DUP:
     case ISD::SPLAT_VECTOR: {
+      Res = N;
       auto Opnd0 = N->getOperand(0);
       if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
         if (CN->isNullValue())
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -379,7 +379,7 @@
 : Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))),
       (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>;
 
-def SVEDup0 : ComplexPattern<vAny, 0, "SelectDupZero", []>;
+def SVEDup0 : ComplexPattern<vAny, 1, "SelectDupZero", []>;
 def SVEDup0Undef : ComplexPattern<vAny, 0, "SelectDupZeroOrUndef", []>;
 
 let AddedComplexity = 1 in {
@@ -393,6 +393,11 @@
 : Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))),
       (inst $Op1, $Op2, $Op3)>;
 
+class SVE_3_Op_Pat_SelZero_Passthru<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                   ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (vt2 SVEDup0:$Dup)), vt3:$Op3))),
+      (inst $Op1, $Op2, $Op3, $Dup)>;
+
 class SVE_3_Op_Pat_Shift_Imm_SelZero<ValueType vtd, SDPatternOperator op,
                                      ValueType vt1, ValueType vt2,
                                      Operand vt3, Instruction inst>
@@ -476,6 +481,12 @@
   let FalseLanes = flags;
   let Constraints = "$Zd = $Zpt";
 }
+
+class PredTwoOpMergeZero<string name, ZPRRegOp zprty>
+: SVEPseudo2Instr<name, 0>,
+  Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zpt), []> {
+  let FalseLanes = FalseLanesZero;
+}
 }
 
 //===----------------------------------------------------------------------===//
@@ -1616,13 +1627,13 @@
 }
 
 multiclass sve_fp_2op_p_zds_zx<SDPatternOperator op> {
-  def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
-  def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
-  def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;
+  def _ZERO_H : PredTwoOpMergeZero<NAME # _H, ZPR16>;
+  def _ZERO_S : PredTwoOpMergeZero<NAME # _S, ZPR32>;
+  def _ZERO_D : PredTwoOpMergeZero<NAME # _D, ZPR64>;
 
-  def : SVE_3_Op_Pat_SelZero<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>;
-  def : SVE_3_Op_Pat_SelZero<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>;
-  def : SVE_3_Op_Pat_SelZero<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>;
+  def : SVE_3_Op_Pat_SelZero_Passthru<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>;
+  def : SVE_3_Op_Pat_SelZero_Passthru<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>;
+  def : SVE_3_Op_Pat_SelZero_Passthru<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>;
 
   def _H : PredTwoOpMergePseudo<NAME # _H, ZPR16>;
   def _S : PredTwoOpMergePseudo<NAME # _S, ZPR32>;
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll
@@ -280,6 +280,26 @@
   ret <vscale x 2 x double> %out
 }
 
+; This test currently breaks on master, because register allocation ends up with:
+;   Dst = FSUB_ZERO_S P0, Z0, Z0
+; The expand pass cannot zero the false lanes of Z0 using MOVPRFX, because
+; MOVPRFX requires that its destination register is not used in any operand
+; position of the prefixed instruction other than the destructive operand, so:
+;   Z0 = MOVPRFX P0/z, Z0
+;   Z0 = FSUB_S Z0, P0/m, Z0
+; would not be valid. Hence the need to use a SELECT of Z0 and DUP(0).
+define <vscale x 4 x float> @fsub_zero_z0_z0(<vscale x 4 x i1> %p, <vscale x 4 x float> %z0) {
+; CHECK-LABEL: fsub_zero_z0_z0
+; CHECK: mov z1.s, #0
+; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: fsubr z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: ret
+  %z0_in = select <vscale x 4 x i1> %p, <vscale x 4 x float> %z0, <vscale x 4 x float> zeroinitializer
+  %add = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1> %p, <vscale x 4 x float> %z0_in, <vscale x 4 x float> %z0)
+  ret <vscale x 4 x float> %add
+}
+
+
 ;
 ; FSUBR
 ;
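
For contrast with the fsub_zero_z0_z0 test above, here is a minimal sketch of the case that stays on the existing path: when the destructive operand register is unique, DOPRegIsUnique remains true and the expand pass can still zero the false lanes with a constructive MOVPRFX instead of falling back to MOV+SEL. This example is not part of the patch; the function name and CHECK lines are assumptions modelled on the existing fsub tests in sve-intrinsics-fp-arith-merging.ll.

; Hypothetical test (not in the patch): %z0 and %z1 are allocated to distinct
; registers, so zeroing the false lanes with MOVPRFX remains legal and no SEL
; is needed.
declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)

define <vscale x 4 x float> @fsub_zero_z0_z1(<vscale x 4 x i1> %p, <vscale x 4 x float> %z0, <vscale x 4 x float> %z1) {
; CHECK-LABEL: fsub_zero_z0_z1
; CHECK: movprfx z0.s, p0/z, z0.s
; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
  %z0_in = select <vscale x 4 x i1> %p, <vscale x 4 x float> %z0, <vscale x 4 x float> zeroinitializer
  %sub = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1> %p, <vscale x 4 x float> %z0_in, <vscale x 4 x float> %z1)
  ret <vscale x 4 x float> %sub
}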