Index: llvm/include/llvm/Target/GlobalISel/Target.td
===================================================================
--- llvm/include/llvm/Target/GlobalISel/Target.td
+++ llvm/include/llvm/Target/GlobalISel/Target.td
@@ -22,6 +22,8 @@
 
 def s32 : LLT;
 def s64 : LLT;
+def v2s32 : LLT;
+def v4s16 : LLT;
 
 // Defines a matcher for complex operands. This is analogous to ComplexPattern
 // from SelectionDAG.
Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -420,7 +420,6 @@
   bool tryBitfieldInsertOp(SDNode *N);
   bool tryBitfieldInsertInZeroOp(SDNode *N);
   bool tryShiftAmountMod(SDNode *N);
-  bool tryHighFPExt(SDNode *N);
 
   bool tryReadRegister(SDNode *N);
   bool tryWriteRegister(SDNode *N);
@@ -2470,35 +2469,6 @@
   return true;
 }
 
-/// Try to form fcvtl2 instructions from a floating-point extend of a high-half
-/// extract of a subvector.
-bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) {
-  assert(N->getOpcode() == ISD::FP_EXTEND);
-
-  // There are 2 forms of fcvtl2 - extend to double or extend to float.
-  SDValue Extract = N->getOperand(0);
-  EVT VT = N->getValueType(0);
-  EVT NarrowVT = Extract.getValueType();
-  if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) &&
-      (VT != MVT::v4f32 || NarrowVT != MVT::v4f16))
-    return false;
-
-  // Optionally look past a bitcast.
-  Extract = peekThroughBitcasts(Extract);
-  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
-    return false;
-
-  // Match extract from start of high half index.
-  // Example: v8i16 -> v4i16 means the extract must begin at index 4.
-  unsigned ExtractIndex = Extract.getConstantOperandVal(1);
-  if (ExtractIndex != Extract.getValueType().getVectorNumElements())
-    return false;
-
-  auto Opcode = VT == MVT::v2f64 ? AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16;
-  CurDAG->SelectNodeTo(N, Opcode, VT, Extract.getOperand(0));
-  return true;
-}
-
 static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
                                 SDValue &Opd0, unsigned &Immr, unsigned &Imms,
                                 unsigned NumberOfIgnoredLowBits = 0,
@@ -4272,11 +4242,6 @@
       return;
     break;
 
-  case ISD::FP_EXTEND:
-    if (tryHighFPExt(Node))
-      return;
-    break;
-
   case ISD::OR:
     if (tryBitfieldInsertOp(Node))
      return;
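For context (illustrative only, not part of the diff): the removed C++ fold matched a floating-point extend of a vector's high half, and the TableGen patterns added below cover the same shape for both SelectionDAG and GlobalISel. A minimal example, with a made-up function name, that should now select to fcvtl2:

  ; The shufflevector is the high half (elements 2 and 3 of the v4f32), which
  ; lowers to an extract_subvector at index 2; the fpext of it should select
  ; to "fcvtl2 v0.2d, v0.4s" rather than ext/mov followed by fcvtl.
  define <2 x double> @fpext_high_f32(<4 x float> %v) {
    %hi = shufflevector <4 x float> %v, <4 x float> poison, <2 x i32> <i32 2, i32 3>
    %ext = fpext <2 x float> %hi to <2 x double>
    ret <2 x double> %ext
  }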
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -133,6 +133,18 @@
 def extract_high_v2i64 :
    ComplexPattern<v1i64, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
 
+def extract_high_v8f16 :
+   ComplexPattern<v4f16, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
+def extract_high_v4f32 :
+   ComplexPattern<v2f32, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
+
+def gi_extract_high_v8f16 :
+   GIComplexOperandMatcher<v4s16, "selectExtractHigh">,
+   GIComplexPatternEquiv<extract_high_v8f16>;
+def gi_extract_high_v4f32 :
+   GIComplexOperandMatcher<v2s32, "selectExtractHigh">,
+   GIComplexPatternEquiv<extract_high_v4f32>;
+
 def extract_high_dup_v8i16 :
    BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 4))>;
 def extract_high_dup_v4i32 :
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4690,11 +4690,16 @@
 def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
           (FCVTLv4i16 V64:$Rn)>;
 def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
-                                                                (i64 4)))),
+                                                                  (i64 4)))),
+          (FCVTLv8i16 V128:$Rn)>;
+def : Pat<(v2f64 (any_fpextend (v2f32 V64:$Rn))),
+          (FCVTLv2i32 V64:$Rn)>;
+def : Pat<(v2f64 (any_fpextend (v2f32 (extract_high_v4f32 (v4f32 V128:$Rn))))),
+          (FCVTLv4i32 V128:$Rn)>;
+def : Pat<(v4f32 (any_fpextend (v4f16 V64:$Rn))),
+          (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (any_fpextend (v4f16 (extract_high_v8f16 (v8f16 V128:$Rn))))),
           (FCVTLv8i16 V128:$Rn)>;
-def : Pat<(v2f64 (any_fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
-
-def : Pat<(v4f32 (any_fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
 
 defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
 defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
Index: llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -436,6 +436,8 @@
 
   ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
 
+  ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
+
   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                       int OpIdx = -1) const;
   void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
@@ -6877,6 +6879,23 @@
   }}};
 }
 
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
+  if (!Root.isReg())
+    return std::nullopt;
+  MachineRegisterInfo &MRI =
+      Root.getParent()->getParent()->getParent()->getRegInfo();
+
+  MachineInstr *Extract = getDefIgnoringCopies(Root.getReg(), MRI);
+  if (Extract && Extract->getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+      Root.getReg() == Extract->getOperand(1).getReg()) {
+    Register ExtReg = Extract->getOperand(2).getReg();
+    return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
+  }
+
+  return std::nullopt;
+}
+
 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
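On the GlobalISel path the high-half extract arrives as a G_UNMERGE_VALUES: operand 0 is the low-half def, operand 1 the high-half def, and operand 2 the full source vector. selectExtractHigh accepts the root only when it is the high-half def (looking through copies) and renders the source register, so the gi_extract_high_* matchers feed the same FCVTL2 patterns. The half-precision analogue, again illustrative with a made-up name:

  ; After legalization %hi becomes the second def of a G_UNMERGE_VALUES of %v,
  ; so this should now select to "fcvtl2 v0.4s, v0.8h" instead of mov+fcvtl.
  define <4 x float> @fpext_high_f16(<8 x half> %v) {
    %hi = shufflevector <8 x half> %v, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    %ext = fpext <4 x half> %hi to <4 x float>
    ret <4 x float> %ext
  }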
Index: llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
+++ llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
@@ -74,11 +74,10 @@
 define <4 x i64> @fptosi_v4i64_v4f32(<4 x float> %x) #0 {
 ; CHECK-LABEL: fptosi_v4i64_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    fcvtl2 v1.2d, v0.4s
 ; CHECK-NEXT:    fcvtl v0.2d, v0.2s
-; CHECK-NEXT:    fcvtl v1.2d, v1.2s
-; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    ret
   %val = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0
   ret <4 x i64> %val
@@ -87,11 +86,10 @@
 define <4 x i64> @fptoui_v4i64_v4f32(<4 x float> %x) #0 {
 ; CHECK-LABEL: fptoui_v4i64_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    fcvtl2 v1.2d, v0.4s
 ; CHECK-NEXT:    fcvtl v0.2d, v0.2s
-; CHECK-NEXT:    fcvtl v1.2d, v1.2s
-; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-NEXT:    fcvtzu v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-NEXT:    ret
   %val = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0
   ret <4 x i64> %val
Index: llvm/test/CodeGen/AArch64/fpext.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fpext.ll
+++ llvm/test/CodeGen/AArch64/fpext.ll
@@ -75,9 +75,9 @@
 ;
 ; CHECK-GI-LABEL: fpext_v4f32_v4f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    fcvtl v0.2d, v0.2s
-; CHECK-GI-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-GI-NEXT:    fcvtl v2.2d, v0.2s
+; CHECK-GI-NEXT:    fcvtl2 v1.2d, v0.4s
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fpext <4 x float> %a to <4 x double>
@@ -224,9 +224,9 @@
 ;
 ; CHECK-GI-LABEL: fpext_v8f16_v8f32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NEXT:    fcvtl2 v1.4s, v0.8h
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fpext <8 x half> %a to <8 x float>
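The fp-intrinsics-vector.ll improvements fall out of the patterns using any_fpextend, which matches the strict variant as well as the ordinary one, so the constrained fptosi/fptoui tests no longer need the separate ext. A sketch of the strict high-half case this should now also cover (illustrative, made-up function name, not part of the diff):

  ; A strictfp high-half extend should likewise select to fcvtl2 rather than
  ; ext+fcvtl, since any_fpextend also matches STRICT_FP_EXTEND.
  declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float>, metadata)

  define <2 x double> @strict_fpext_high_f32(<4 x float> %v) strictfp {
    %hi = shufflevector <4 x float> %v, <4 x float> poison, <2 x i32> <i32 2, i32 3>
    %ext = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float> %hi, metadata !"fpexcept.strict") strictfp
    ret <2 x double> %ext
  }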