Index: llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -447,15 +447,11 @@
   uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask;
   uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask;
   bool FalseZero = FalseLanes == AArch64::FalseLanesZero;
-
   Register DstReg = MI.getOperand(0).getReg();
   bool DstIsDead = MI.getOperand(0).isDead();
-
-  if (DType == AArch64::DestructiveBinary)
-    assert(DstReg != MI.getOperand(3).getReg());
-
   bool UseRev = false;
   unsigned PredIdx, DOPIdx, SrcIdx, Src2Idx;
+
   switch (DType) {
   case AArch64::DestructiveBinaryComm:
   case AArch64::DestructiveBinaryCommWithRev:
@@ -489,12 +485,14 @@
     llvm_unreachable("Unsupported Destructive Operand type");
   }
 
-#ifndef NDEBUG
   // MOVPRFX can only be used if the destination operand
   // is the destructive operand, not as any other operand,
   // so the Destructive Operand must be unique.
   bool DOPRegIsUnique = false;
   switch (DType) {
+  case AArch64::DestructiveBinary:
+    DOPRegIsUnique = DstReg != MI.getOperand(SrcIdx).getReg();
+    break;
   case AArch64::DestructiveBinaryComm:
   case AArch64::DestructiveBinaryCommWithRev:
     DOPRegIsUnique =
@@ -512,7 +510,6 @@
          MI.getOperand(DOPIdx).getReg() != MI.getOperand(Src2Idx).getReg());
     break;
   }
-#endif
 
   // Resolve the reverse opcode
   if (UseRev) {
@@ -527,23 +524,27 @@
 
   // Get the right MOVPRFX
   uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode);
-  unsigned MovPrfx, MovPrfxZero;
+  unsigned MovPrfx, LSLZero, MovPrfxZero;
   switch (ElementSize) {
   case AArch64::ElementSizeNone:
   case AArch64::ElementSizeB:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_B;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B;
     break;
   case AArch64::ElementSizeH:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_H;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H;
     break;
   case AArch64::ElementSizeS:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_S;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S;
     break;
   case AArch64::ElementSizeD:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_D;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D;
     break;
   default:
@@ -555,9 +556,10 @@
   //
   MachineInstrBuilder PRFX, DOP;
   if (FalseZero) {
-#ifndef NDEBUG
-    assert(DOPRegIsUnique && "The destructive operand should be unique");
-#endif
+    // If we cannot prefix the requested instruction, we'll instead emit a
+    // prefixed_zeroing_mov for DestructiveBinary.
+    assert((DOPRegIsUnique || AArch64::DestructiveBinary == DType) &&
+           "The destructive operand should be unique");
     assert(ElementSize != AArch64::ElementSizeNone &&
            "This instruction is unpredicated");
 
@@ -569,10 +571,19 @@
 
     // After the movprfx, the destructive operand is same as Dst
     DOPIdx = 0;
+
+    // Create the additional LSL to zero the lanes when the DstReg is not
+    // unique. Zeros the lanes in z0 that aren't active in p0 with the
+    // sequence: movprfx z0.b, p0/z, z0.b; lsl z0.b, p0/m, z0.b, #0
+    if (DType == AArch64::DestructiveBinary && !DOPRegIsUnique) {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LSLZero))
+          .addReg(DstReg, RegState::Define)
+          .add(MI.getOperand(PredIdx))
+          .addReg(DstReg)
+          .addImm(0);
+    }
   } else if (DstReg != MI.getOperand(DOPIdx).getReg()) {
-#ifndef NDEBUG
     assert(DOPRegIsUnique && "The destructive operand should be unique");
-#endif
     PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx))
                .addReg(DstReg, RegState::Define)
                .addReg(MI.getOperand(DOPIdx).getReg());
@@ -591,6 +602,7 @@
         .add(MI.getOperand(PredIdx))
         .add(MI.getOperand(SrcIdx));
     break;
+  case AArch64::DestructiveBinary:
   case AArch64::DestructiveBinaryImm:
   case AArch64::DestructiveBinaryComm:
   case AArch64::DestructiveBinaryCommWithRev:
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -433,7 +433,7 @@
   defm ORR_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_orr>;
   defm EOR_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_eor>;
   defm AND_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_and>;
-  defm BIC_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<null_frag>;
+  defm BIC_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_bic>;
 } // End HasSVEorSME, UseExperimentalZeroingPseudos
 
 let Predicates = [HasSVEorSME] in {
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
@@ -344,8 +344,7 @@
 define <vscale x 16 x i8> @bic_i8_zero(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: bic_i8_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.b, #0 // =0x0
-; CHECK-NEXT:    sel z0.b, p0, z0.b, z2.b
+; CHECK-NEXT:    movprfx z0.b, p0/z, z0.b
 ; CHECK-NEXT:    bic z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> zeroinitializer
@@ -358,8 +357,7 @@
 define <vscale x 8 x i16> @bic_i16_zero(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
 ; CHECK-LABEL: bic_i16_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.h, #0 // =0x0
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
 ; CHECK-NEXT:    bic z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
@@ -372,8 +370,7 @@
 define <vscale x 4 x i32> @bic_i32_zero(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: bic_i32_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.s, #0 // =0x0
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z2.s
+; CHECK-NEXT:    movprfx z0.s, p0/z, z0.s
 ; CHECK-NEXT:    bic z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer
@@ -386,8 +383,7 @@
 define <vscale x 2 x i64> @bic_i64_zero(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: bic_i64_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, #0 // =0x0
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
+; CHECK-NEXT:    movprfx z0.d, p0/z, z0.d
 ; CHECK-NEXT:    bic z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
@@ -397,6 +393,39 @@
   ret <vscale x 2 x i64> %out
 }
 
+; BIC (i.e. A & ~A) is an illegal operation with movprfx, so the codegen depends on the IR before pseudo expansion
+define <vscale x 2 x i64> @bic_i64_zero_no_unique_reg(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: bic_i64_zero_no_unique_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z0.d, p0/z, z0.d
+; CHECK-NEXT:    bic z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.bic.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %a_z,
+                                                               <vscale x 2 x i64> %a_z)
+  ret <vscale x 2 x i64> %out
+}
+
+; BIC (i.e. A & ~B) is not a commutative operation, so disable it when the
+; destination operand is not the destructive operand
+define <vscale x 2 x i64> @bic_i64_zero_no_comm(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: bic_i64_zero_no_comm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.d, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
+; CHECK-NEXT:    bic z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.bic.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %b,
+                                                               <vscale x 2 x i64> %a_z)
+  ret <vscale x 2 x i64> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.mir
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -mattr=+sve -mattr=+use-experimental-zeroing-pseudos -run-pass=aarch64-expand-pseudo %s -o - | FileCheck %s
+
+# Should create an additional LSL to zero the lanes, as the DstReg is not unique.
+
+--- |
+  define <vscale x 8 x i16> @bic_i16_zero(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+    %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
+    %out = call <vscale x 8 x i16> @llvm.aarch64.sve.bic.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a_z, <vscale x 8 x i16> %a_z)
+    ret <vscale x 8 x i16> %out
+  }
+
+  declare <vscale x 8 x i16> @llvm.aarch64.sve.bic.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+...
+---
+name:            bic_i16_zero
+alignment:       4
+tracksRegLiveness: true
+tracksDebugUserValues: true
+registers:       []
+liveins:
+  - { reg: '$p0', virtual-reg: '' }
+  - { reg: '$z0', virtual-reg: '' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0, $z0
+
+    ; CHECK-LABEL: name: bic_i16_zero
+    ; CHECK: liveins: $p0, $z0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: BUNDLE implicit-def $z0, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $h0, implicit-def $b0, implicit-def $z0_hi, implicit killed $p0, implicit $z0 {
+    ; CHECK-NEXT:   $z0 = MOVPRFX_ZPzZ_H $p0, $z0
+    ; CHECK-NEXT:   $z0 = LSL_ZPmI_H killed renamable $p0, internal $z0, 0
+    ; CHECK-NEXT:   $z0 = BIC_ZPmZ_H killed renamable $p0, internal killed $z0, internal killed renamable $z0
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: RET undef $lr, implicit $z0
+    renamable $z0 = BIC_ZPZZ_ZERO_H killed renamable $p0, killed renamable $z0, killed renamable $z0
+    RET_ReallyLR implicit $z0
+...
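
For reference, the two expansion shapes of the zeroing BIC pseudo, written out
as a hand-assembled sketch distilled from the CHECK lines above (the register
assignments are illustrative, not generated output):

  // Destructive operand is unique (e.g. bic_i8_zero): the destination z0
  // appears only as the destructive source, so the zeroing MOVPRFX can
  // legally prefix the BIC directly.
  movprfx z0.d, p0/z, z0.d        // z0 = p0 ? z0 : 0
  bic     z0.d, p0/m, z0.d, z1.d  // active lanes: z0 & ~z1

  // Destructive operand is not unique (e.g. bic_i16_zero in the MIR test):
  // z0 is also the second source, which MOVPRFX forbids, so the LSL #0 is
  // emitted as the prefixed instruction instead. It leaves the active lanes
  // unchanged, and the BIC then executes unprefixed on the already-zeroed
  // vector.
  movprfx z0.h, p0/z, z0.h        // z0 = p0 ? z0 : 0
  lsl     z0.h, p0/m, z0.h, #0    // predicated no-op consuming the prefix
  bic     z0.h, p0/m, z0.h, z0.h  // active lanes: z0 & ~z0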