Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -867,8 +867,15 @@
     Reg = N.getOperand(0);
 
     // Don't match if free 32-bit -> 64-bit zext can be used instead.
-    if (Ext == AArch64_AM::UXTW &&
-        Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
+    auto isDef32 = [](SDValue N) {
+      unsigned Opc = N.getOpcode();
+      return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
+             Opc != ISD::CopyFromReg && Opc != ISD::AssertSext &&
+             Opc != ISD::AssertZext && Opc != ISD::AssertAlign &&
+             Opc != ISD::FREEZE;
+    };
+    if (Ext == AArch64_AM::UXTW && Reg->getValueType(0).getSizeInBits() == 32 &&
+        isDef32(Reg))
       return false;
   }
 
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -457,23 +457,6 @@
 
 } // end namespace AArch64ISD
 
-namespace {
-
-// Any instruction that defines a 32-bit result zeros out the high half of the
-// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. But any other 32-bit operation will zero-extend
-// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper
-// 32 bits, they're probably just qualifying a CopyFromReg.
-static inline bool isDef32(const SDNode &N) {
-  unsigned Opc = N.getOpcode();
-  return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
-         Opc != ISD::CopyFromReg && Opc != ISD::AssertSext &&
-         Opc != ISD::AssertZext && Opc != ISD::AssertAlign &&
-         Opc != ISD::FREEZE;
-}
-
-} // end anonymous namespace
-
 namespace AArch64 {
 /// Possible values of current rounding mode, which is specified in bits
 /// 23:22 of FPCR.
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7259,14 +7259,6 @@
 //----------------------------------------------------------------------------
 // FIXME: Like for X86, these should go in their own separate .td file.
 
-def def32 : PatLeaf<(i32 GPR32:$src), [{
-  return isDef32(*N);
-}]>;
-
-// In the case of a 32-bit def that is known to implicitly zero-extend,
-// we can use a SUBREG_TO_REG.
-def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
-
 // For an anyext, we don't care what the high bits are, so we can perform an
 // INSERT_SUBREF into an IMPLICIT_DEF.
 def : Pat<(i64 (anyext GPR32:$src)),
Index: llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -195,6 +195,29 @@
   });
 }
 
+// From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
+//
+// When you use the 32-bit form of an instruction, the upper 32 bits of the
+// source registers are ignored and the upper 32 bits of the destination
+// register are set to zero.
+//
+// If the 32-bit form of an AArch64 instruction defines the source operand of
+// the zero-extend, the zero-extend is redundant. Check that MI's opcode is a
+// real (target-specific) AArch64 instruction; if it is not, conservatively
+// do not process it.
+static bool isFreeZExtOfGPR32(MachineInstr *MI, MachineRegisterInfo *MRI) {
+  // A COPY from an FPR will become an FMOVSWr.
+  if (MI->getOpcode() == TargetOpcode::COPY &&
+      MI->getOperand(1).getReg().isVirtual()) {
+    const TargetRegisterClass *RC =
+        MRI->getRegClass(MI->getOperand(1).getReg());
+    if (RC == &AArch64::FPR32RegClass || RC == &AArch64::FPR64RegClass ||
+        RC == &AArch64::FPR128RegClass)
+      return true;
+  }
+  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
+}
+
 bool AArch64MIPeepholeOpt::visitORR(
     MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
   // Check this ORR comes from below zero-extend pattern.
@@ -208,20 +231,7 @@
     return false;
 
   MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
-  if (!SrcMI)
-    return false;
-
-  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
-  //
-  // When you use the 32-bit form of an instruction, the upper 32 bits of the
-  // source registers are ignored and the upper 32 bits of the destination
-  // register are set to zero.
-  //
-  // If AArch64's 32-bit form of instruction defines the source operand of
-  // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
-  // real AArch64 instruction and if it is not, do not process the opcode
-  // conservatively.
-  if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
+  if (!SrcMI || !isFreeZExtOfGPR32(SrcMI, MRI))
     return false;
 
   Register DefReg = MI.getOperand(0).getReg();
Index: llvm/test/CodeGen/AArch64/arm64-popcnt.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -35,8 +35,8 @@
 ; CHECK-LABEL: cnt32_advsimd_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    cnt.8b v0, v0
 ; CHECK-NEXT:    uaddlv.8b h0, v0
 ; CHECK-NEXT:    fmov w0, s0
Index: llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
+++ llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
@@ -328,7 +328,8 @@
 define i64 @sign_extend_inreg_isdef32(i64) {
 ; CHECK-LABEL: sign_extend_inreg_isdef32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sbfx x0, x0, #32, #16
+; CHECK-NEXT:    sbfx x8, x0, #32, #16
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %2 = lshr i64 %0, 32
   %3 = shl i64 %2, 16
Index: llvm/test/CodeGen/AArch64/dp1.ll
===================================================================
--- llvm/test/CodeGen/AArch64/dp1.ll
+++ llvm/test/CodeGen/AArch64/dp1.ll
@@ -246,7 +246,6 @@
 ; CHECK-GISEL-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-GISEL-NEXT:    uaddlv h0, v0.8b
 ; CHECK-GISEL-NEXT:    fmov w9, s0
-; CHECK-GISEL-NEXT:    mov w9, w9
 ; CHECK-GISEL-NEXT:    str x9, [x8]
 ; CHECK-GISEL-NEXT:    ret
   %val0_tmp = load i64, i64* @var64
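
Note on the ISA guarantee this patch relies on (an illustrative aside, not part
of the patch itself): any instruction writing a 32-bit W register zeroes bits
[63:32] of the corresponding X register, and that includes FMOVSWr, which is
why a COPY from an FPR32/FPR64/FPR128 class register is treated as a free
zero-extend above. A minimal C sketch; the function names are hypothetical and
the assembly in the comments is typical clang -O2 AArch64 codegen, shown only
for illustration:

  #include <stdint.h>
  #include <string.h>

  uint64_t add_zext(uint32_t a, uint32_t b) {
    // The 32-bit add ("add w0, w0, w1") already zeroes x0[63:32],
    // so the widening conversion below costs no extra instruction.
    return (uint64_t)(a + b);
  }

  uint64_t bitcast_zext(float f) {
    // The float-to-GPR move lowers to "fmov w0, s0" (FMOVSWr), which
    // likewise zeroes the upper 32 bits, so the zext to i64 is free.
    uint32_t u;
    memcpy(&u, &f, sizeof u);
    return (uint64_t)u;
  }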